Diffstat (limited to 'fs')
-rw-r--r-- fs/Kconfig | 1
-rw-r--r-- fs/affs/affs.h | 4
-rw-r--r-- fs/affs/amigaffs.h | 144
-rw-r--r-- fs/affs/dir.c | 2
-rw-r--r-- fs/affs/file.c | 10
-rw-r--r-- fs/affs/inode.c | 1
-rw-r--r-- fs/affs/namei.c | 87
-rw-r--r-- fs/afs/Makefile | 3
-rw-r--r-- fs/afs/cmservice.c | 16
-rw-r--r-- fs/afs/dir.c | 1
-rw-r--r-- fs/afs/file.c | 1
-rw-r--r-- fs/afs/inode.c | 7
-rw-r--r-- fs/afs/internal.h | 16
-rw-r--r-- fs/afs/main.c | 2
-rw-r--r-- fs/afs/mntpt.c | 1
-rw-r--r-- fs/afs/rxrpc.c | 18
-rw-r--r-- fs/afs/security.c | 5
-rw-r--r-- fs/afs/super.c | 1
-rw-r--r-- fs/afs/xattr.c | 121
-rw-r--r-- fs/aio.c | 15
-rw-r--r-- fs/autofs4/dev-ioctl.c | 2
-rw-r--r-- fs/bfs/inode.c | 4
-rw-r--r-- fs/binfmt_misc.c | 2
-rw-r--r-- fs/block_dev.c | 97
-rw-r--r-- fs/btrfs/acl.c | 13
-rw-r--r-- fs/btrfs/backref.c | 51
-rw-r--r-- fs/btrfs/btrfs_inode.h | 10
-rw-r--r-- fs/btrfs/check-integrity.c | 57
-rw-r--r-- fs/btrfs/compression.c | 154
-rw-r--r-- fs/btrfs/compression.h | 48
-rw-r--r-- fs/btrfs/ctree.c | 71
-rw-r--r-- fs/btrfs/ctree.h | 128
-rw-r--r-- fs/btrfs/delayed-inode.c | 46
-rw-r--r-- fs/btrfs/delayed-inode.h | 6
-rw-r--r-- fs/btrfs/delayed-ref.c | 37
-rw-r--r-- fs/btrfs/delayed-ref.h | 14
-rw-r--r-- fs/btrfs/dev-replace.c | 13
-rw-r--r-- fs/btrfs/dir-item.c | 78
-rw-r--r-- fs/btrfs/disk-io.c | 268
-rw-r--r-- fs/btrfs/disk-io.h | 14
-rw-r--r-- fs/btrfs/export.c | 5
-rw-r--r-- fs/btrfs/extent-tree.c | 531
-rw-r--r-- fs/btrfs/extent_io.c | 429
-rw-r--r-- fs/btrfs/extent_io.h | 90
-rw-r--r-- fs/btrfs/extent_map.c | 10
-rw-r--r-- fs/btrfs/extent_map.h | 3
-rw-r--r-- fs/btrfs/file-item.c | 45
-rw-r--r-- fs/btrfs/file.c | 174
-rw-r--r-- fs/btrfs/free-space-cache.c | 2
-rw-r--r-- fs/btrfs/free-space-tree.c | 39
-rw-r--r-- fs/btrfs/hash.c | 5
-rw-r--r-- fs/btrfs/inode-map.c | 4
-rw-r--r-- fs/btrfs/inode.c | 814
-rw-r--r-- fs/btrfs/ioctl.c | 60
-rw-r--r-- fs/btrfs/lzo.c | 33
-rw-r--r-- fs/btrfs/ordered-data.c | 37
-rw-r--r-- fs/btrfs/ordered-data.h | 6
-rw-r--r-- fs/btrfs/print-tree.c | 7
-rw-r--r-- fs/btrfs/props.c | 7
-rw-r--r-- fs/btrfs/qgroup.c | 323
-rw-r--r-- fs/btrfs/qgroup.h | 60
-rw-r--r-- fs/btrfs/raid56.c | 70
-rw-r--r-- fs/btrfs/reada.c | 38
-rw-r--r-- fs/btrfs/relocation.c | 17
-rw-r--r-- fs/btrfs/root-tree.c | 10
-rw-r--r-- fs/btrfs/scrub.c | 568
-rw-r--r-- fs/btrfs/send.c | 162
-rw-r--r-- fs/btrfs/super.c | 77
-rw-r--r-- fs/btrfs/sysfs.c | 41
-rw-r--r-- fs/btrfs/tests/btrfs-tests.c | 1
-rw-r--r-- fs/btrfs/tests/extent-io-tests.c | 2
-rw-r--r-- fs/btrfs/transaction.c | 59
-rw-r--r-- fs/btrfs/transaction.h | 6
-rw-r--r-- fs/btrfs/tree-log.c | 46
-rw-r--r-- fs/btrfs/volumes.c | 939
-rw-r--r-- fs/btrfs/volumes.h | 15
-rw-r--r-- fs/btrfs/xattr.c | 2
-rw-r--r-- fs/btrfs/zlib.c | 20
-rw-r--r-- fs/buffer.c | 189
-rw-r--r-- fs/cachefiles/internal.h | 4
-rw-r--r-- fs/cachefiles/namei.c | 2
-rw-r--r-- fs/cachefiles/rdwr.c | 4
-rw-r--r-- fs/ceph/acl.c | 1
-rw-r--r-- fs/ceph/addr.c | 10
-rw-r--r-- fs/ceph/caps.c | 25
-rw-r--r-- fs/ceph/debugfs.c | 21
-rw-r--r-- fs/ceph/dir.c | 23
-rw-r--r-- fs/ceph/export.c | 4
-rw-r--r-- fs/ceph/file.c | 83
-rw-r--r-- fs/ceph/inode.c | 22
-rw-r--r-- fs/ceph/mds_client.c | 77
-rw-r--r-- fs/ceph/mds_client.h | 15
-rw-r--r-- fs/ceph/mdsmap.c | 44
-rw-r--r-- fs/ceph/snap.c | 2
-rw-r--r-- fs/ceph/super.c | 7
-rw-r--r-- fs/ceph/super.h | 31
-rw-r--r-- fs/ceph/xattr.c | 3
-rw-r--r-- fs/cifs/Kconfig | 9
-rw-r--r-- fs/cifs/cifs_unicode.c | 8
-rw-r--r-- fs/cifs/cifsacl.c | 30
-rw-r--r-- fs/cifs/cifsencrypt.c | 4
-rw-r--r-- fs/cifs/cifsglob.h | 2
-rw-r--r-- fs/cifs/cifsproto.h | 3
-rw-r--r-- fs/cifs/cifssmb.c | 21
-rw-r--r-- fs/cifs/file.c | 20
-rw-r--r-- fs/cifs/inode.c | 32
-rw-r--r-- fs/cifs/misc.c | 2
-rw-r--r-- fs/cifs/smb1ops.c | 9
-rw-r--r-- fs/cifs/smb2ops.c | 127
-rw-r--r-- fs/cifs/smb2pdu.c | 73
-rw-r--r-- fs/cifs/smb2proto.h | 3
-rw-r--r-- fs/cifs/smb2transport.c | 28
-rw-r--r-- fs/cifs/transport.c | 11
-rw-r--r-- fs/cifs/xattr.c | 8
-rw-r--r-- fs/coda/file.c | 4
-rw-r--r-- fs/compat_ioctl.c | 27
-rw-r--r-- fs/configfs/item.c | 8
-rw-r--r-- fs/configfs/symlink.c | 3
-rw-r--r-- fs/crypto/Kconfig | 1
-rw-r--r-- fs/crypto/bio.c | 2
-rw-r--r-- fs/crypto/crypto.c | 23
-rw-r--r-- fs/crypto/fname.c | 9
-rw-r--r-- fs/crypto/fscrypt_private.h | 9
-rw-r--r-- fs/crypto/keyinfo.c | 173
-rw-r--r-- fs/crypto/policy.c | 9
-rw-r--r-- fs/dax.c | 176
-rw-r--r-- fs/dcache.c | 64
-rw-r--r-- fs/debugfs/file.c | 2
-rw-r--r-- fs/debugfs/inode.c | 14
-rw-r--r-- fs/direct-io.c | 25
-rw-r--r-- fs/eventfd.c | 6
-rw-r--r-- fs/eventpoll.c | 12
-rw-r--r-- fs/exec.c | 35
-rw-r--r-- fs/exofs/dir.c | 5
-rw-r--r-- fs/ext2/dir.c | 2
-rw-r--r-- fs/ext2/ext2.h | 2
-rw-r--r-- fs/ext2/file.c | 5
-rw-r--r-- fs/ext2/inode.c | 4
-rw-r--r-- fs/ext2/super.c | 17
-rw-r--r-- fs/ext2/xattr.c | 48
-rw-r--r-- fs/ext4/acl.c | 25
-rw-r--r-- fs/ext4/ext4.h | 65
-rw-r--r-- fs/ext4/ext4_jbd2.h | 23
-rw-r--r-- fs/ext4/extents.c | 88
-rw-r--r-- fs/ext4/file.c | 117
-rw-r--r-- fs/ext4/fsmap.c | 4
-rw-r--r-- fs/ext4/fsync.c | 2
-rw-r--r-- fs/ext4/ialloc.c | 76
-rw-r--r-- fs/ext4/indirect.c | 3
-rw-r--r-- fs/ext4/inline.c | 7
-rw-r--r-- fs/ext4/inode.c | 126
-rw-r--r-- fs/ext4/ioctl.c | 10
-rw-r--r-- fs/ext4/mballoc.c | 170
-rw-r--r-- fs/ext4/mballoc.h | 6
-rw-r--r-- fs/ext4/migrate.c | 2
-rw-r--r-- fs/ext4/move_extent.c | 2
-rw-r--r-- fs/ext4/namei.c | 144
-rw-r--r-- fs/ext4/page-io.c | 15
-rw-r--r-- fs/ext4/readpage.c | 4
-rw-r--r-- fs/ext4/super.c | 135
-rw-r--r-- fs/ext4/sysfs.c | 2
-rw-r--r-- fs/ext4/xattr.c | 1707
-rw-r--r-- fs/ext4/xattr.h | 35
-rw-r--r-- fs/f2fs/data.c | 10
-rw-r--r-- fs/f2fs/f2fs.h | 25
-rw-r--r-- fs/f2fs/file.c | 4
-rw-r--r-- fs/f2fs/node.c | 6
-rw-r--r-- fs/f2fs/segment.c | 18
-rw-r--r-- fs/f2fs/segment.h | 5
-rw-r--r-- fs/f2fs/super.c | 2
-rw-r--r-- fs/fcntl.c | 318
-rw-r--r-- fs/file.c | 22
-rw-r--r-- fs/file_table.c | 1
-rw-r--r-- fs/filesystems.c | 4
-rw-r--r-- fs/fs-writeback.c | 12
-rw-r--r-- fs/fs_pin.c | 4
-rw-r--r-- fs/fuse/control.c | 2
-rw-r--r-- fs/fuse/dev.c | 24
-rw-r--r-- fs/fuse/file.c | 32
-rw-r--r-- fs/fuse/fuse_i.h | 11
-rw-r--r-- fs/fuse/inode.c | 18
-rw-r--r-- fs/gfs2/bmap.c | 6
-rw-r--r-- fs/gfs2/dir.c | 7
-rw-r--r-- fs/gfs2/glock.c | 135
-rw-r--r-- fs/gfs2/glock.h | 7
-rw-r--r-- fs/gfs2/glops.c | 56
-rw-r--r-- fs/gfs2/incore.h | 7
-rw-r--r-- fs/gfs2/inode.c | 19
-rw-r--r-- fs/gfs2/log.c | 5
-rw-r--r-- fs/gfs2/lops.c | 18
-rw-r--r-- fs/gfs2/main.c | 1
-rw-r--r-- fs/gfs2/meta_io.c | 2
-rw-r--r-- fs/gfs2/ops_fstype.c | 6
-rw-r--r-- fs/gfs2/rgrp.c | 6
-rw-r--r-- fs/gfs2/super.c | 33
-rw-r--r-- fs/gfs2/sys.c | 26
-rw-r--r-- fs/gfs2/xattr.c | 4
-rw-r--r-- fs/hfs/extent.c | 4
-rw-r--r-- fs/hfsplus/extents.c | 5
-rw-r--r-- fs/hugetlbfs/inode.c | 2
-rw-r--r-- fs/inode.c | 30
-rw-r--r-- fs/internal.h | 2
-rw-r--r-- fs/iomap.c | 120
-rw-r--r-- fs/jbd2/commit.c | 16
-rw-r--r-- fs/jbd2/journal.c | 6
-rw-r--r-- fs/jbd2/transaction.c | 48
-rw-r--r-- fs/jffs2/readinode.c | 2
-rw-r--r-- fs/jfs/jfs_logmgr.c | 2
-rw-r--r-- fs/jfs/jfs_metapage.c | 11
-rw-r--r-- fs/jfs/jfs_metapage.h | 1
-rw-r--r-- fs/libfs.c | 8
-rw-r--r-- fs/lockd/clntlock.c | 1
-rw-r--r-- fs/lockd/clntproc.c | 26
-rw-r--r-- fs/lockd/svc.c | 6
-rw-r--r-- fs/lockd/svclock.c | 18
-rw-r--r-- fs/locks.c | 99
-rw-r--r-- fs/mbcache.c | 52
-rw-r--r-- fs/minix/dir.c | 2
-rw-r--r-- fs/minix/itree_common.c | 2
-rw-r--r-- fs/mount.h | 1
-rw-r--r-- fs/mpage.c | 5
-rw-r--r-- fs/namei.c | 61
-rw-r--r-- fs/namespace.c | 31
-rw-r--r-- fs/ncpfs/mmap.c | 2
-rw-r--r-- fs/nfs/Kconfig | 5
-rw-r--r-- fs/nfs/Makefile | 1
-rw-r--r-- fs/nfs/blocklayout/blocklayout.c | 4
-rw-r--r-- fs/nfs/callback.c | 26
-rw-r--r-- fs/nfs/callback_proc.c | 47
-rw-r--r-- fs/nfs/callback_xdr.c | 110
-rw-r--r-- fs/nfs/client.c | 67
-rw-r--r-- fs/nfs/dir.c | 155
-rw-r--r-- fs/nfs/direct.c | 21
-rw-r--r-- fs/nfs/file.c | 30
-rw-r--r-- fs/nfs/filelayout/filelayout.c | 8
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.c | 25
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 10
-rw-r--r-- fs/nfs/inode.c | 5
-rw-r--r-- fs/nfs/internal.h | 8
-rw-r--r-- fs/nfs/namespace.c | 34
-rw-r--r-- fs/nfs/nfs3proc.c | 54
-rw-r--r-- fs/nfs/nfs42proc.c | 24
-rw-r--r-- fs/nfs/nfs42xdr.c | 22
-rw-r--r-- fs/nfs/nfs4client.c | 282
-rw-r--r-- fs/nfs/nfs4getroot.c | 3
-rw-r--r-- fs/nfs/nfs4namespace.c | 7
-rw-r--r-- fs/nfs/nfs4proc.c | 108
-rw-r--r-- fs/nfs/nfs4state.c | 12
-rw-r--r-- fs/nfs/nfs4xdr.c | 94
-rw-r--r-- fs/nfs/objlayout/Kbuild | 5
-rw-r--r-- fs/nfs/objlayout/objio_osd.c | 675
-rw-r--r-- fs/nfs/objlayout/objlayout.c | 706
-rw-r--r-- fs/nfs/objlayout/objlayout.h | 183
-rw-r--r-- fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 415
-rw-r--r-- fs/nfs/pagelist.c | 77
-rw-r--r-- fs/nfs/pnfs.c | 87
-rw-r--r-- fs/nfs/pnfs.h | 16
-rw-r--r-- fs/nfs/pnfs_nfs.c | 24
-rw-r--r-- fs/nfs/proc.c | 2
-rw-r--r-- fs/nfs/read.c | 9
-rw-r--r-- fs/nfs/super.c | 22
-rw-r--r-- fs/nfs/write.c | 121
-rw-r--r-- fs/nfsd/blocklayout.c | 4
-rw-r--r-- fs/nfsd/export.c | 4
-rw-r--r-- fs/nfsd/nfs4proc.c | 16
-rw-r--r-- fs/nfsd/nfs4state.c | 25
-rw-r--r-- fs/nfsd/nfs4xdr.c | 19
-rw-r--r-- fs/nfsd/nfsctl.c | 2
-rw-r--r-- fs/nfsd/vfs.c | 97
-rw-r--r-- fs/nilfs2/segbuf.c | 2
-rw-r--r-- fs/nilfs2/segment.c | 5
-rw-r--r-- fs/notify/fsnotify.c | 8
-rw-r--r-- fs/nsfs.c | 4
-rw-r--r-- fs/ntfs/namei.c | 2
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 6
-rw-r--r-- fs/ocfs2/cluster/netdebug.c | 1
-rw-r--r-- fs/ocfs2/dlmglue.c | 4
-rw-r--r-- fs/ocfs2/export.c | 2
-rw-r--r-- fs/ocfs2/inode.c | 2
-rw-r--r-- fs/ocfs2/ocfs2_fs.h | 5
-rw-r--r-- fs/ocfs2/stackglue.c | 2
-rw-r--r-- fs/ocfs2/super.c | 2
-rw-r--r-- fs/ocfs2/xattr.c | 23
-rw-r--r-- fs/open.c | 22
-rw-r--r-- fs/orangefs/orangefs-bufmap.c | 12
-rw-r--r-- fs/overlayfs/Kconfig | 1
-rw-r--r-- fs/overlayfs/copy_up.c | 132
-rw-r--r-- fs/overlayfs/dir.c | 88
-rw-r--r-- fs/overlayfs/inode.c | 115
-rw-r--r-- fs/overlayfs/namei.c | 151
-rw-r--r-- fs/overlayfs/overlayfs.h | 57
-rw-r--r-- fs/overlayfs/ovl_entry.h | 4
-rw-r--r-- fs/overlayfs/super.c | 58
-rw-r--r-- fs/overlayfs/util.c | 91
-rw-r--r-- fs/pnode.c | 212
-rw-r--r-- fs/proc/base.c | 5
-rw-r--r-- fs/proc/inode.c | 2
-rw-r--r-- fs/proc/kcore.c | 2
-rw-r--r-- fs/proc/namespaces.c | 1
-rw-r--r-- fs/proc/task_mmu.c | 4
-rw-r--r-- fs/pstore/inode.c | 22
-rw-r--r-- fs/pstore/internal.h | 2
-rw-r--r-- fs/pstore/platform.c | 69
-rw-r--r-- fs/pstore/pmsg.c | 10
-rw-r--r-- fs/pstore/ram.c | 18
-rw-r--r-- fs/quota/dquot.c | 32
-rw-r--r-- fs/read_write.c | 234
-rw-r--r-- fs/reiserfs/item_ops.c | 24
-rw-r--r-- fs/reiserfs/journal.c | 6
-rw-r--r-- fs/select.c | 53
-rw-r--r-- fs/seq_file.c | 16
-rw-r--r-- fs/signalfd.c | 4
-rw-r--r-- fs/splice.c | 2
-rw-r--r-- fs/stat.c | 1
-rw-r--r-- fs/statfs.c | 60
-rw-r--r-- fs/sync.c | 2
-rw-r--r-- fs/sysv/dir.c | 2
-rw-r--r-- fs/timerfd.c | 43
-rw-r--r-- fs/tracefs/inode.c | 2
-rw-r--r-- fs/ubifs/Kconfig | 13
-rw-r--r-- fs/ubifs/debug.c | 4
-rw-r--r-- fs/ubifs/dir.c | 12
-rw-r--r-- fs/ubifs/file.c | 12
-rw-r--r-- fs/ubifs/ioctl.c | 8
-rw-r--r-- fs/ubifs/misc.h | 10
-rw-r--r-- fs/ubifs/recovery.c | 1
-rw-r--r-- fs/ubifs/sb.c | 14
-rw-r--r-- fs/ubifs/ubifs.h | 14
-rw-r--r-- fs/ubifs/xattr.c | 12
-rw-r--r-- fs/ufs/balloc.c | 70
-rw-r--r-- fs/ufs/dir.c | 2
-rw-r--r-- fs/ufs/ialloc.c | 6
-rw-r--r-- fs/ufs/inode.c | 96
-rw-r--r-- fs/ufs/super.c | 96
-rw-r--r-- fs/ufs/ufs_fs.h | 9
-rw-r--r-- fs/ufs/util.c | 17
-rw-r--r-- fs/ufs/util.h | 19
-rw-r--r-- fs/userfaultfd.c | 71
-rw-r--r-- fs/xattr.c | 27
-rw-r--r-- fs/xfs/Kconfig | 13
-rw-r--r-- fs/xfs/Makefile | 3
-rw-r--r-- fs/xfs/kmem.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_ag_resv.c | 3
-rw-r--r-- fs/xfs/libxfs/xfs_alloc.c | 8
-rw-r--r-- fs/xfs/libxfs/xfs_alloc.h | 2
-rw-r--r-- fs/xfs/libxfs/xfs_alloc_btree.c | 26
-rw-r--r-- fs/xfs/libxfs/xfs_attr.c | 26
-rw-r--r-- fs/xfs/libxfs/xfs_attr_leaf.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_attr_remote.c | 13
-rw-r--r-- fs/xfs/libxfs/xfs_attr_sf.h | 10
-rw-r--r-- fs/xfs/libxfs/xfs_bit.h | 24
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c | 60
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.h | 2
-rw-r--r-- fs/xfs/libxfs/xfs_bmap_btree.c | 34
-rw-r--r-- fs/xfs/libxfs/xfs_btree.c | 54
-rw-r--r-- fs/xfs/libxfs/xfs_btree.h | 33
-rw-r--r-- fs/xfs/libxfs/xfs_cksum.h | 16
-rw-r--r-- fs/xfs/libxfs/xfs_da_btree.c | 14
-rw-r--r-- fs/xfs/libxfs/xfs_da_btree.h | 8
-rw-r--r-- fs/xfs/libxfs/xfs_da_format.c | 28
-rw-r--r-- fs/xfs/libxfs/xfs_da_format.h | 64
-rw-r--r-- fs/xfs/libxfs/xfs_dir2.c | 3
-rw-r--r-- fs/xfs/libxfs/xfs_dir2.h | 8
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_block.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_leaf.c | 18
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_node.c | 10
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_priv.h | 10
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_sf.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_format.h | 113
-rw-r--r-- fs/xfs/libxfs/xfs_fs.h | 16
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc.c | 53
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc.h | 5
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc_btree.c | 36
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.c | 7
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.h | 31
-rw-r--r-- fs/xfs/libxfs/xfs_log_format.h | 256
-rw-r--r-- fs/xfs/libxfs/xfs_log_recover.h | 2
-rw-r--r-- fs/xfs/libxfs/xfs_quota_defs.h | 6
-rw-r--r-- fs/xfs/libxfs/xfs_refcount.c | 59
-rw-r--r-- fs/xfs/libxfs/xfs_refcount.h | 16
-rw-r--r-- fs/xfs/libxfs/xfs_refcount_btree.c | 12
-rw-r--r-- fs/xfs/libxfs/xfs_rmap.c | 14
-rw-r--r-- fs/xfs/libxfs/xfs_rmap.h | 11
-rw-r--r-- fs/xfs/libxfs/xfs_rmap_btree.c | 34
-rw-r--r-- fs/xfs/libxfs/xfs_rtbitmap.c | 4
-rw-r--r-- fs/xfs/libxfs/xfs_sb.c | 4
-rw-r--r-- fs/xfs/libxfs/xfs_symlink_remote.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_trans_resv.c | 4
-rw-r--r-- fs/xfs/libxfs/xfs_types.h | 46
-rw-r--r-- fs/xfs/uuid.c | 63
-rw-r--r-- fs/xfs/uuid.h | 35
-rw-r--r-- fs/xfs/xfs.h | 4
-rw-r--r-- fs/xfs/xfs_acl.c | 6
-rw-r--r-- fs/xfs/xfs_acl.h | 1
-rw-r--r-- fs/xfs/xfs_aops.c | 20
-rw-r--r-- fs/xfs/xfs_attr.h | 3
-rw-r--r-- fs/xfs/xfs_attr_list.c | 61
-rw-r--r-- fs/xfs/xfs_bmap_item.c | 17
-rw-r--r-- fs/xfs/xfs_bmap_util.c | 174
-rw-r--r-- fs/xfs/xfs_bmap_util.h | 4
-rw-r--r-- fs/xfs/xfs_buf.c | 107
-rw-r--r-- fs/xfs/xfs_buf.h | 6
-rw-r--r-- fs/xfs/xfs_buf_item.c | 21
-rw-r--r-- fs/xfs/xfs_dir2_readdir.c | 341
-rw-r--r-- fs/xfs/xfs_discard.c | 4
-rw-r--r-- fs/xfs/xfs_dquot.c | 85
-rw-r--r-- fs/xfs/xfs_error.c | 319
-rw-r--r-- fs/xfs/xfs_error.h | 44
-rw-r--r-- fs/xfs/xfs_file.c | 445
-rw-r--r-- fs/xfs/xfs_fsmap.c | 5
-rw-r--r-- fs/xfs/xfs_fsops.c | 16
-rw-r--r-- fs/xfs/xfs_fsops.h | 4
-rw-r--r-- fs/xfs/xfs_globals.c | 5
-rw-r--r-- fs/xfs/xfs_icache.c | 61
-rw-r--r-- fs/xfs/xfs_icache.h | 4
-rw-r--r-- fs/xfs/xfs_inode.c | 25
-rw-r--r-- fs/xfs/xfs_inode.h | 7
-rw-r--r-- fs/xfs/xfs_inode_item.c | 8
-rw-r--r-- fs/xfs/xfs_ioctl.c | 27
-rw-r--r-- fs/xfs/xfs_ioctl.h | 10
-rw-r--r-- fs/xfs/xfs_ioctl32.h | 6
-rw-r--r-- fs/xfs/xfs_iomap.c | 30
-rw-r--r-- fs/xfs/xfs_iops.c | 6
-rw-r--r-- fs/xfs/xfs_itable.c | 2
-rw-r--r-- fs/xfs/xfs_itable.h | 2
-rw-r--r-- fs/xfs/xfs_linux.h | 23
-rw-r--r-- fs/xfs/xfs_log.c | 87
-rw-r--r-- fs/xfs/xfs_log.h | 2
-rw-r--r-- fs/xfs/xfs_log_cil.c | 91
-rw-r--r-- fs/xfs/xfs_log_priv.h | 3
-rw-r--r-- fs/xfs/xfs_log_recover.c | 57
-rw-r--r-- fs/xfs/xfs_message.c | 5
-rw-r--r-- fs/xfs/xfs_mount.c | 46
-rw-r--r-- fs/xfs/xfs_mount.h | 60
-rw-r--r-- fs/xfs/xfs_qm.c | 28
-rw-r--r-- fs/xfs/xfs_qm_bhv.c | 2
-rw-r--r-- fs/xfs/xfs_quotaops.c | 1
-rw-r--r-- fs/xfs/xfs_reflink.c | 101
-rw-r--r-- fs/xfs/xfs_reflink.h | 8
-rw-r--r-- fs/xfs/xfs_rtalloc.c | 8
-rw-r--r-- fs/xfs/xfs_rtalloc.h | 3
-rw-r--r-- fs/xfs/xfs_stats.c | 8
-rw-r--r-- fs/xfs/xfs_stats.h | 190
-rw-r--r-- fs/xfs/xfs_super.c | 30
-rw-r--r-- fs/xfs/xfs_symlink.c | 12
-rw-r--r-- fs/xfs/xfs_symlink.h | 1
-rw-r--r-- fs/xfs/xfs_sysctl.h | 1
-rw-r--r-- fs/xfs/xfs_sysfs.c | 81
-rw-r--r-- fs/xfs/xfs_trace.h | 40
-rw-r--r-- fs/xfs/xfs_trans.h | 8
-rw-r--r-- fs/xfs/xfs_trans_bmap.c | 11
-rw-r--r-- fs/xfs/xfs_trans_buf.c | 21
-rw-r--r-- fs/xfs/xfs_trans_rmap.c | 2
453 files changed, 12287 insertions, 9816 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 83eab52fb3f6..b0e42b6a96b9 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -39,6 +39,7 @@ config FS_DAX
depends on MMU
depends on !(ARM || MIPS || SPARC)
select FS_IOMAP
+ select DAX
help
Direct Access (DAX) can be used on memory-backed block devices.
If the block device supports DAX and the filesystem supports DAX,
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 2f8bab390d13..773749be8290 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -7,7 +7,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/amigaffs.h>
+#include "amigaffs.h"
#include <linux/mutex.h>
#include <linux/workqueue.h>
@@ -173,7 +173,7 @@ extern int affs_link(struct dentry *olddentry, struct inode *dir,
struct dentry *dentry);
extern int affs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname);
-extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry,
+extern int affs_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags);
diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h
new file mode 100644
index 000000000000..43b41c06aa37
--- /dev/null
+++ b/fs/affs/amigaffs.h
@@ -0,0 +1,144 @@
+#ifndef AMIGAFFS_H
+#define AMIGAFFS_H
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+#define FS_OFS 0x444F5300
+#define FS_FFS 0x444F5301
+#define FS_INTLOFS 0x444F5302
+#define FS_INTLFFS 0x444F5303
+#define FS_DCOFS 0x444F5304
+#define FS_DCFFS 0x444F5305
+#define MUFS_FS 0x6d754653 /* 'muFS' */
+#define MUFS_OFS 0x6d754600 /* 'muF\0' */
+#define MUFS_FFS 0x6d754601 /* 'muF\1' */
+#define MUFS_INTLOFS 0x6d754602 /* 'muF\2' */
+#define MUFS_INTLFFS 0x6d754603 /* 'muF\3' */
+#define MUFS_DCOFS 0x6d754604 /* 'muF\4' */
+#define MUFS_DCFFS 0x6d754605 /* 'muF\5' */
+
+#define T_SHORT 2
+#define T_LIST 16
+#define T_DATA 8
+
+#define ST_LINKFILE -4
+#define ST_FILE -3
+#define ST_ROOT 1
+#define ST_USERDIR 2
+#define ST_SOFTLINK 3
+#define ST_LINKDIR 4
+
+#define AFFS_ROOT_BMAPS 25
+
+struct affs_date {
+ __be32 days;
+ __be32 mins;
+ __be32 ticks;
+};
+
+struct affs_short_date {
+ __be16 days;
+ __be16 mins;
+ __be16 ticks;
+};
+
+struct affs_root_head {
+ __be32 ptype;
+ __be32 spare1;
+ __be32 spare2;
+ __be32 hash_size;
+ __be32 spare3;
+ __be32 checksum;
+ __be32 hashtable[1];
+};
+
+struct affs_root_tail {
+ __be32 bm_flag;
+ __be32 bm_blk[AFFS_ROOT_BMAPS];
+ __be32 bm_ext;
+ struct affs_date root_change;
+ u8 disk_name[32];
+ __be32 spare1;
+ __be32 spare2;
+ struct affs_date disk_change;
+ struct affs_date disk_create;
+ __be32 spare3;
+ __be32 spare4;
+ __be32 dcache;
+ __be32 stype;
+};
+
+struct affs_head {
+ __be32 ptype;
+ __be32 key;
+ __be32 block_count;
+ __be32 spare1;
+ __be32 first_data;
+ __be32 checksum;
+ __be32 table[1];
+};
+
+struct affs_tail {
+ __be32 spare1;
+ __be16 uid;
+ __be16 gid;
+ __be32 protect;
+ __be32 size;
+ u8 comment[92];
+ struct affs_date change;
+ u8 name[32];
+ __be32 spare2;
+ __be32 original;
+ __be32 link_chain;
+ __be32 spare[5];
+ __be32 hash_chain;
+ __be32 parent;
+ __be32 extension;
+ __be32 stype;
+};
+
+struct slink_front
+{
+ __be32 ptype;
+ __be32 key;
+ __be32 spare1[3];
+ __be32 checksum;
+ u8 symname[1]; /* depends on block size */
+};
+
+struct affs_data_head
+{
+ __be32 ptype;
+ __be32 key;
+ __be32 sequence;
+ __be32 size;
+ __be32 next;
+ __be32 checksum;
+ u8 data[1]; /* depends on block size */
+};
+
+/* Permission bits */
+
+#define FIBF_OTR_READ 0x8000
+#define FIBF_OTR_WRITE 0x4000
+#define FIBF_OTR_EXECUTE 0x2000
+#define FIBF_OTR_DELETE 0x1000
+#define FIBF_GRP_READ 0x0800
+#define FIBF_GRP_WRITE 0x0400
+#define FIBF_GRP_EXECUTE 0x0200
+#define FIBF_GRP_DELETE 0x0100
+
+#define FIBF_HIDDEN 0x0080
+#define FIBF_SCRIPT 0x0040
+#define FIBF_PURE 0x0020 /* no use under linux */
+#define FIBF_ARCHIVED 0x0010 /* never set, always cleared on write */
+#define FIBF_NOREAD 0x0008 /* 0 means allowed */
+#define FIBF_NOWRITE 0x0004 /* 0 means allowed */
+#define FIBF_NOEXECUTE 0x0002 /* 0 means allowed, ignored under linux */
+#define FIBF_NODELETE 0x0001 /* 0 means allowed */
+
+#define FIBF_OWNER 0x000F /* Bits pertaining to owner */
+#define FIBF_MASK 0xEE0E /* Bits modified by Linux */
+
+#endif
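
For reference, the FS_* and MUFS_* magics above are Amiga boot-block dostype IDs: the top three bytes are ASCII ("DOS" or "muF") and the low byte selects the variant (0 = OFS, 1 = FFS, 2/3 = international, 4/5 = dircache), with MUFS_FS spelling out the full "muFS". A minimal decode helper, as an illustrative sketch (print_dostype is not part of the patch):

	#include <linux/printk.h>
	#include <linux/types.h>

	static void print_dostype(u32 dostype)
	{
		/* e.g. FS_INTLFFS (0x444F5303) prints "DOS\3" */
		pr_info("dostype %c%c%c\\%u\n",
			(dostype >> 24) & 0xff, (dostype >> 16) & 0xff,
			(dostype >> 8) & 0xff, dostype & 0xff);
	}
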
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index f1e7294381c5..591ecd7f3063 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -35,7 +35,7 @@ const struct inode_operations affs_dir_inode_operations = {
.symlink = affs_symlink,
.mkdir = affs_mkdir,
.rmdir = affs_rmdir,
- .rename = affs_rename,
+ .rename = affs_rename2,
.setattr = affs_notify_change,
};
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0deec9cc2362..196ee7f6fdc4 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -499,7 +499,7 @@ affs_getemptyblk_ino(struct inode *inode, int block)
}
static int
-affs_do_readpage_ofs(struct page *page, unsigned to)
+affs_do_readpage_ofs(struct page *page, unsigned to, int create)
{
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
@@ -518,7 +518,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
boff = tmp % bsize;
while (pos < to) {
- bh = affs_bread_ino(inode, bidx, 0);
+ bh = affs_bread_ino(inode, bidx, create);
if (IS_ERR(bh))
return PTR_ERR(bh);
tmp = min(bsize - boff, to - pos);
@@ -620,7 +620,7 @@ affs_readpage_ofs(struct file *file, struct page *page)
memset(page_address(page) + to, 0, PAGE_SIZE - to);
}
- err = affs_do_readpage_ofs(page, to);
+ err = affs_do_readpage_ofs(page, to, 0);
if (!err)
SetPageUptodate(page);
unlock_page(page);
@@ -657,7 +657,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return 0;
/* XXX: inefficient but safe in the face of short writes */
- err = affs_do_readpage_ofs(page, PAGE_SIZE);
+ err = affs_do_readpage_ofs(page, PAGE_SIZE, 1);
if (err) {
unlock_page(page);
put_page(page);
@@ -679,7 +679,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
int written;
from = pos & (PAGE_SIZE - 1);
- to = pos + len;
+ to = from + len;
/*
* XXX: not sure if this can handle short copies (len < copied), but
* we don't have to, because the page should always be uptodate here,
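
Worked example of the one-line fix above: with PAGE_SIZE 4096, a write of len = 8 at pos = 8292 gives from = 8292 & 4095 = 100, so the page-relative end must be to = 108; the old to = pos + len computed 8300, far past the end of the page.
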
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index abcc59899229..fd4ef3c40e40 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
inode->i_fop = &affs_file_operations;
break;
case ST_SOFTLINK:
+ inode->i_size = strlen((char *)AFFS_HEAD(bh)->table);
inode->i_mode |= S_IFLNK;
inode_nohighmem(inode);
inode->i_op = &affs_symlink_inode_operations;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 96dd1d09a273..46d3ace6761d 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -365,6 +365,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
symname++;
}
*p = 0;
+ inode->i_size = i + 1;
mark_buffer_dirty_inode(bh, inode);
affs_brelse(bh);
mark_inode_dirty(inode);
@@ -393,21 +394,14 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
}
-int
+static int
affs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+ struct inode *new_dir, struct dentry *new_dentry)
{
struct super_block *sb = old_dir->i_sb;
struct buffer_head *bh = NULL;
int retval;
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
- pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
- old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
-
retval = affs_check_name(new_dentry->d_name.name,
new_dentry->d_name.len,
affs_nofilenametruncate(old_dentry));
@@ -447,6 +441,76 @@ done:
return retval;
}
+static int
+affs_xrename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+
+ struct super_block *sb = old_dir->i_sb;
+ struct buffer_head *bh_old = NULL;
+ struct buffer_head *bh_new = NULL;
+ int retval;
+
+ bh_old = affs_bread(sb, d_inode(old_dentry)->i_ino);
+ if (!bh_old)
+ return -EIO;
+
+ bh_new = affs_bread(sb, d_inode(new_dentry)->i_ino);
+ if (!bh_new)
+ return -EIO;
+
+ /* Remove old header from its parent directory. */
+ affs_lock_dir(old_dir);
+ retval = affs_remove_hash(old_dir, bh_old);
+ affs_unlock_dir(old_dir);
+ if (retval)
+ goto done;
+
+ /* Remove new header from its parent directory. */
+ affs_lock_dir(new_dir);
+ retval = affs_remove_hash(new_dir, bh_new);
+ affs_unlock_dir(new_dir);
+ if (retval)
+ goto done;
+
+ /* Insert old into the new directory with the new name. */
+ affs_copy_name(AFFS_TAIL(sb, bh_old)->name, new_dentry);
+ affs_fix_checksum(sb, bh_old);
+ affs_lock_dir(new_dir);
+ retval = affs_insert_hash(new_dir, bh_old);
+ affs_unlock_dir(new_dir);
+
+ /* Insert new into the old directory with the old name. */
+ affs_copy_name(AFFS_TAIL(sb, bh_new)->name, old_dentry);
+ affs_fix_checksum(sb, bh_new);
+ affs_lock_dir(old_dir);
+ retval = affs_insert_hash(old_dir, bh_new);
+ affs_unlock_dir(old_dir);
+done:
+ mark_buffer_dirty_inode(bh_old, new_dir);
+ mark_buffer_dirty_inode(bh_new, old_dir);
+ affs_brelse(bh_old);
+ affs_brelse(bh_new);
+ return retval;
+}
+
+int affs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
+ old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
+
+ if (flags & RENAME_EXCHANGE)
+ return affs_xrename(old_dir, old_dentry, new_dir, new_dentry);
+
+ return affs_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
static struct dentry *affs_get_parent(struct dentry *child)
{
struct inode *parent;
@@ -477,11 +541,6 @@ static struct inode *affs_nfs_get_inode(struct super_block *sb, u64 ino,
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (generation && inode->i_generation != generation) {
- iput(inode);
- return ERR_PTR(-ESTALE);
- }
-
return inode;
}
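
With RENAME_EXCHANGE accepted by affs_rename2() above, an atomic name swap on an AFFS mount becomes reachable through renameat2(2). A minimal userspace sketch, assuming a libc that wraps the syscall (otherwise invoke it via syscall(2)); the file names are illustrative:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>

	int main(void)
	{
		/* Both names must already exist; the two inodes swap names
		 * atomically, so neither path ever dangles mid-operation. */
		if (renameat2(AT_FDCWD, "a.txt", AT_FDCWD, "b.txt",
			      RENAME_EXCHANGE) != 0) {
			perror("renameat2");
			return 1;
		}
		return 0;
	}
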
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 4f64b95d57bd..095c54165dfd 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -27,6 +27,7 @@ kafs-objs := \
vlocation.o \
vnode.o \
volume.o \
- write.o
+ write.o \
+ xattr.o
obj-$(CONFIG_AFS_FS) := kafs.o
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 3062cceb5c2a..782d4d05a53b 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -350,7 +350,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
{
struct sockaddr_rxrpc srx;
struct afs_server *server;
- struct uuid_v1 *r;
+ struct afs_uuid *r;
unsigned loop;
__be32 *b;
int ret;
@@ -380,7 +380,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
}
_debug("unmarshall UUID");
- call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL);
+ call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL);
if (!call->request)
return -ENOMEM;
@@ -453,7 +453,7 @@ static int afs_deliver_cb_probe(struct afs_call *call)
static void SRXAFSCB_ProbeUuid(struct work_struct *work)
{
struct afs_call *call = container_of(work, struct afs_call, work);
- struct uuid_v1 *r = call->request;
+ struct afs_uuid *r = call->request;
struct {
__be32 match;
@@ -476,7 +476,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
*/
static int afs_deliver_cb_probe_uuid(struct afs_call *call)
{
- struct uuid_v1 *r;
+ struct afs_uuid *r;
unsigned loop;
__be32 *b;
int ret;
@@ -502,15 +502,15 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
}
_debug("unmarshall UUID");
- call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL);
+ call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL);
if (!call->request)
return -ENOMEM;
b = call->buffer;
r = call->request;
- r->time_low = b[0];
- r->time_mid = htons(ntohl(b[1]));
- r->time_hi_and_version = htons(ntohl(b[2]));
+ r->time_low = ntohl(b[0]);
+ r->time_mid = ntohl(b[1]);
+ r->time_hi_and_version = ntohl(b[2]);
r->clock_seq_hi_and_reserved = ntohl(b[3]);
r->clock_seq_low = ntohl(b[4]);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 949f960337f5..613a77058263 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -61,6 +61,7 @@ const struct inode_operations afs_dir_inode_operations = {
.permission = afs_permission,
.getattr = afs_getattr,
.setattr = afs_setattr,
+ .listxattr = afs_listxattr,
};
const struct dentry_operations afs_fs_dentry_operations = {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0d5b8508869b..510cba15fa56 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -46,6 +46,7 @@ const struct inode_operations afs_file_inode_operations = {
.getattr = afs_getattr,
.setattr = afs_setattr,
.permission = afs_permission,
+ .listxattr = afs_listxattr,
};
const struct address_space_operations afs_fs_aops = {
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index aae55dd15108..342316a9e3e0 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -28,6 +28,11 @@ struct afs_iget_data {
struct afs_volume *volume; /* volume on which resides */
};
+static const struct inode_operations afs_symlink_inode_operations = {
+ .get_link = page_get_link,
+ .listxattr = afs_listxattr,
+};
+
/*
* map the AFS file status to the inode member variables
*/
@@ -67,7 +72,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
inode->i_fop = &afs_mntpt_file_operations;
} else {
inode->i_mode = S_IFLNK | vnode->status.mode;
- inode->i_op = &page_symlink_inode_operations;
+ inode->i_op = &afs_symlink_inode_operations;
}
inode_nohighmem(inode);
break;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 393672997cc2..82e16556afea 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -410,6 +410,15 @@ struct afs_interface {
unsigned mtu; /* MTU of interface */
};
+struct afs_uuid {
+ __be32 time_low; /* low part of timestamp */
+ __be16 time_mid; /* mid part of timestamp */
+ __be16 time_hi_and_version; /* high part of timestamp and version */
+ __u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */
+ __u8 clock_seq_low; /* clock seq low */
+ __u8 node[6]; /* spatially unique node ID (MAC addr) */
+};
+
/*****************************************************************************/
/*
* cache.c
@@ -544,7 +553,7 @@ extern int afs_drop_inode(struct inode *);
* main.c
*/
extern struct workqueue_struct *afs_wq;
-extern struct uuid_v1 afs_uuid;
+extern struct afs_uuid afs_uuid;
/*
* misc.c
@@ -722,6 +731,11 @@ extern int afs_writeback_all(struct afs_vnode *);
extern int afs_flush(struct file *, fl_owner_t);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
+/*
+ * xattr.c
+ */
+extern const struct xattr_handler *afs_xattr_handlers[];
+extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
/*****************************************************************************/
/*
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 51d7d17bca57..9944770849da 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -31,7 +31,7 @@ static char *rootcell;
module_param(rootcell, charp, 0);
MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
-struct uuid_v1 afs_uuid;
+struct afs_uuid afs_uuid;
struct workqueue_struct *afs_wq;
/*
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index bd3b65cde282..690fea9d84c3 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -35,6 +35,7 @@ const struct inode_operations afs_mntpt_inode_operations = {
.lookup = afs_mntpt_lookup,
.readlink = page_readlink,
.getattr = afs_getattr,
+ .listxattr = afs_listxattr,
};
const struct inode_operations afs_autocell_inode_operations = {
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index d5990eb160bd..02781e78ffb6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -341,6 +341,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
struct msghdr msg;
struct kvec iov[1];
size_t offset;
+ s64 tx_total_len;
u32 abort_code;
int ret;
@@ -364,9 +365,20 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
srx.transport.sin.sin_port = call->port;
memcpy(&srx.transport.sin.sin_addr, addr, 4);
+ /* Work out the length we're going to transmit. This is awkward for
+ * calls such as FS.StoreData where there's an extra injection of data
+ * after the initial fixed part.
+ */
+ tx_total_len = call->request_size;
+ if (call->send_pages) {
+ tx_total_len += call->last_to - call->first_offset;
+ tx_total_len += (call->last - call->first) * PAGE_SIZE;
+ }
+
/* create a call */
rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
- (unsigned long) call, gfp,
+ (unsigned long)call,
+ tx_total_len, gfp,
(async ?
afs_wake_up_async_call :
afs_wake_up_call_waiter));
@@ -738,6 +750,8 @@ void afs_send_empty_reply(struct afs_call *call)
_enter("");
+ rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, 0);
+
msg.msg_name = NULL;
msg.msg_namelen = 0;
iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0);
@@ -772,6 +786,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
_enter("");
+ rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, len);
+
iov[0].iov_base = (void *) buf;
iov[0].iov_len = len;
msg.msg_name = NULL;
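
Worked example of the tx_total_len computation above: with PAGE_SIZE 4096, request_size = 128, pages first = 2 through last = 3, first_offset = 512 and last_to = 1024, the total is 128 + (1024 - 512) + (3 - 2) * 4096 = 4736 bytes, i.e. the fixed request plus the 3584 tail bytes of page 2 plus the first 1024 bytes of page 3.
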
diff --git a/fs/afs/security.c b/fs/afs/security.c
index ecb86a670180..faca66227ecf 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -327,12 +327,11 @@ int afs_permission(struct inode *inode, int mask)
if (!(access & AFS_ACE_LOOKUP))
goto permission_denied;
} else if (mask & MAY_READ) {
- if (!(access & AFS_ACE_READ))
+ if (!(access & AFS_ACE_LOOKUP))
goto permission_denied;
} else if (mask & MAY_WRITE) {
if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */
- AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */
- AFS_ACE_WRITE))) /* chmod */
+ AFS_ACE_INSERT))) /* create, mkdir, symlink, rename to */
goto permission_denied;
} else {
BUG();
diff --git a/fs/afs/super.c b/fs/afs/super.c
index c79633e5cfd8..67680c2d96cf 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -319,6 +319,7 @@ static int afs_fill_super(struct super_block *sb,
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = AFS_FS_MAGIC;
sb->s_op = &afs_super_ops;
+ sb->s_xattr = afs_xattr_handlers;
ret = super_setup_bdi(sb);
if (ret)
return ret;
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
new file mode 100644
index 000000000000..2830e4f48d85
--- /dev/null
+++ b/fs/afs/xattr.c
@@ -0,0 +1,121 @@
+/* Extended attribute handling for AFS. We use xattrs to get and set metadata
+ * instead of providing pioctl().
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/xattr.h>
+#include "internal.h"
+
+static const char afs_xattr_list[] =
+ "afs.cell\0"
+ "afs.fid\0"
+ "afs.volume";
+
+/*
+ * Retrieve a list of the supported xattrs.
+ */
+ssize_t afs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ if (size == 0)
+ return sizeof(afs_xattr_list);
+ if (size < sizeof(afs_xattr_list))
+ return -ERANGE;
+ memcpy(buffer, afs_xattr_list, sizeof(afs_xattr_list));
+ return sizeof(afs_xattr_list);
+}
+
+/*
+ * Get the name of the cell on which a file resides.
+ */
+static int afs_xattr_get_cell(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ struct afs_cell *cell = vnode->volume->cell;
+ size_t namelen;
+
+ namelen = strlen(cell->name);
+ if (size == 0)
+ return namelen;
+ if (namelen > size)
+ return -ERANGE;
+ memcpy(buffer, cell->name, size);
+ return namelen;
+}
+
+static const struct xattr_handler afs_xattr_afs_cell_handler = {
+ .name = "afs.cell",
+ .get = afs_xattr_get_cell,
+};
+
+/*
+ * Get the volume ID, vnode ID and vnode uniquifier of a file as a sequence of
+ * hex numbers separated by colons.
+ */
+static int afs_xattr_get_fid(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ char text[8 + 1 + 8 + 1 + 8 + 1];
+ size_t len;
+
+ len = sprintf(text, "%x:%x:%x",
+ vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+ if (size == 0)
+ return len;
+ if (len > size)
+ return -ERANGE;
+ memcpy(buffer, text, len);
+ return len;
+}
+
+static const struct xattr_handler afs_xattr_afs_fid_handler = {
+ .name = "afs.fid",
+ .get = afs_xattr_get_fid,
+};
+
+/*
+ * Get the name of the volume on which a file resides.
+ */
+static int afs_xattr_get_volume(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ const char *volname = vnode->volume->vlocation->vldb.name;
+ size_t namelen;
+
+ namelen = strlen(volname);
+ if (size == 0)
+ return namelen;
+ if (namelen > size)
+ return -ERANGE;
+ memcpy(buffer, volname, size);
+ return namelen;
+}
+
+static const struct xattr_handler afs_xattr_afs_volume_handler = {
+ .name = "afs.volume",
+ .get = afs_xattr_get_volume,
+};
+
+const struct xattr_handler *afs_xattr_handlers[] = {
+ &afs_xattr_afs_cell_handler,
+ &afs_xattr_afs_fid_handler,
+ &afs_xattr_afs_volume_handler,
+ NULL
+};
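
Together with the sb->s_xattr hookup in the fs/afs/super.c hunk above, these handlers expose the three read-only attributes to the ordinary xattr syscalls. A minimal sketch of reading the FID from userspace (the path is illustrative; any file on a kAFS mount works):

	#include <stdio.h>
	#include <sys/xattr.h>

	int main(void)
	{
		char buf[64];
		/* "afs.fid" returns "volume:vnode:uniquifier" in hex */
		ssize_t n = getxattr("/afs/example.org/somefile", "afs.fid",
				     buf, sizeof(buf));

		if (n < 0) {
			perror("getxattr");
			return 1;
		}
		printf("%.*s\n", (int)n, buf);
		return 0;
	}
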
diff --git a/fs/aio.c b/fs/aio.c
index f52d925ee259..dcad3a66748c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1541,7 +1541,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
ssize_t ret;
/* enforce forwards compatibility on users */
- if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
+ if (unlikely(iocb->aio_reserved2)) {
pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
@@ -1568,6 +1568,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->common.ki_pos = iocb->aio_offset;
req->common.ki_complete = aio_complete;
req->common.ki_flags = iocb_flags(req->common.ki_filp);
+ req->common.ki_hint = file_write_hint(file);
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
@@ -1586,6 +1587,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->common.ki_flags |= IOCB_EVENTFD;
}
+ ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);
+ if (unlikely(ret)) {
+ pr_debug("EINVAL: aio_rw_flags\n");
+ goto out_put_req;
+ }
+
+ if ((req->common.ki_flags & IOCB_NOWAIT) &&
+ !(req->common.ki_flags & IOCB_DIRECT)) {
+ ret = -EOPNOTSUPP;
+ goto out_put_req;
+ }
+
ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
if (unlikely(ret)) {
pr_debug("EFAULT: aio_key\n");
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 734cbf8d9676..dd9f1bebb5a3 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -344,7 +344,7 @@ static int autofs_dev_ioctl_fail(struct file *fp,
int status;
token = (autofs_wqt_t) param->fail.token;
- status = param->fail.status ? param->fail.status : -ENOENT;
+ status = param->fail.status < 0 ? param->fail.status : -ENOENT;
return autofs4_wait_release(sbi, token, status);
}
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index f2deec0a62f0..25e312cb6071 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -1,7 +1,7 @@
/*
* fs/bfs/inode.c
* BFS superblock and inode operations.
- * Copyright (C) 1999-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
* From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
*
* Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
@@ -19,7 +19,7 @@
#include <linux/uaccess.h>
#include "bfs.h"
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>");
MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux");
MODULE_LICENSE("GPL");
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index bee1a36bc2ec..f4718098ac31 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -818,7 +818,7 @@ static const struct super_operations s_ops = {
static int bm_fill_super(struct super_block *sb, void *data, int silent)
{
int err;
- static struct tree_descr bm_files[] = {
+ static const struct tree_descr bm_files[] = {
[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
[3] = {"register", &bm_register_operations, S_IWUSR},
/* last one */ {""}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2a305c1a2d88..9941dc8342df 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -225,6 +225,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
bio_init(&bio, vecs, nr_pages);
bio.bi_bdev = bdev;
bio.bi_iter.bi_sector = pos >> 9;
+ bio.bi_write_hint = iocb->ki_hint;
bio.bi_private = current;
bio.bi_end_io = blkdev_bio_end_io_simple;
@@ -262,8 +263,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
if (vecs != inline_vecs)
kfree(vecs);
- if (unlikely(bio.bi_error))
- return bio.bi_error;
+ if (unlikely(bio.bi_status))
+ ret = blk_status_to_errno(bio.bi_status);
+
+ bio_uninit(&bio);
+
return ret;
}
@@ -288,16 +292,18 @@ static void blkdev_bio_end_io(struct bio *bio)
bool should_dirty = dio->should_dirty;
if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
- if (bio->bi_error && !dio->bio.bi_error)
- dio->bio.bi_error = bio->bi_error;
+ if (bio->bi_status && !dio->bio.bi_status)
+ dio->bio.bi_status = bio->bi_status;
} else {
if (!dio->is_sync) {
struct kiocb *iocb = dio->iocb;
- ssize_t ret = dio->bio.bi_error;
+ ssize_t ret;
- if (likely(!ret)) {
+ if (likely(!dio->bio.bi_status)) {
ret = dio->size;
iocb->ki_pos += ret;
+ } else {
+ ret = blk_status_to_errno(dio->bio.bi_status);
}
dio->iocb->ki_complete(iocb, ret, 0);
@@ -334,7 +340,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
loff_t pos = iocb->ki_pos;
blk_qc_t qc = BLK_QC_T_NONE;
- int ret;
+ int ret = 0;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
@@ -358,12 +364,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
for (;;) {
bio->bi_bdev = bdev;
bio->bi_iter.bi_sector = pos >> 9;
+ bio->bi_write_hint = iocb->ki_hint;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
- bio->bi_error = ret;
+ bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
break;
}
@@ -412,7 +419,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
}
__set_current_state(TASK_RUNNING);
- ret = dio->bio.bi_error;
+ if (!ret)
+ ret = blk_status_to_errno(dio->bio.bi_status);
if (likely(!ret))
ret = dio->size;
@@ -436,7 +444,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static __init int blkdev_init(void)
{
- blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio));
+ blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
if (!blkdev_dio_pool)
return -ENOMEM;
return 0;
@@ -624,7 +632,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
struct block_device *bdev = I_BDEV(bd_inode);
int error;
- error = filemap_write_and_wait_range(filp->f_mapping, start, end);
+ error = file_write_and_wait_range(filp, start, end);
if (error)
return error;
@@ -717,72 +725,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(bdev_write_page);
-int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
- pgoff_t *pgoff)
-{
- phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;
-
- if (pgoff)
- *pgoff = PHYS_PFN(phys_off);
- if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
- return -EINVAL;
- return 0;
-}
-EXPORT_SYMBOL(bdev_dax_pgoff);
-
-/**
- * bdev_dax_supported() - Check if the device supports dax for filesystem
- * @sb: The superblock of the device
- * @blocksize: The block size of the device
- *
- * This is a library function for filesystems to check if the block device
- * can be mounted with dax option.
- *
- * Return: negative errno if unsupported, 0 if supported.
- */
-int bdev_dax_supported(struct super_block *sb, int blocksize)
-{
- struct block_device *bdev = sb->s_bdev;
- struct dax_device *dax_dev;
- pgoff_t pgoff;
- int err, id;
- void *kaddr;
- pfn_t pfn;
- long len;
-
- if (blocksize != PAGE_SIZE) {
- vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
- return -EINVAL;
- }
-
- err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
- if (err) {
- vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax");
- return err;
- }
-
- dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
- if (!dax_dev) {
- vfs_msg(sb, KERN_ERR, "error: device does not support dax");
- return -EOPNOTSUPP;
- }
-
- id = dax_read_lock();
- len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
- dax_read_unlock(id);
-
- put_dax(dax_dev);
-
- if (len < 1) {
- vfs_msg(sb, KERN_ERR,
- "error: dax access failed (%ld)", len);
- return len < 0 ? len : -EIO;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(bdev_dax_supported);
-
/*
* pseudo-fs
*/
@@ -1809,6 +1751,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
return -ENOMEM;
filp->f_mapping = bdev->bd_inode->i_mapping;
+ filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
return blkdev_get(bdev, filp->f_mode, filp);
}
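
The bi_error to bi_status switch above follows a tree-wide convention change in this merge: drivers and endio paths carry a blk_status_t and translate to an errno only at the boundary with blk_status_to_errno(). A minimal consumer-side sketch (sample_end_io is illustrative, not from the patch):

	#include <linux/bio.h>
	#include <linux/completion.h>
	#include <linux/printk.h>

	/* completion callback for a bio we submitted ourselves */
	static void sample_end_io(struct bio *bio)
	{
		struct completion *done = bio->bi_private;
		int err = blk_status_to_errno(bio->bi_status);	/* 0 on success */

		if (err)
			pr_warn("sample: I/O failed: %d\n", err);
		complete(done);
		bio_put(bio);
	}
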
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 247b8dfaf6e5..8d8370ddb6b2 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -78,12 +78,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
- if (acl) {
- ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
- if (ret)
- return ret;
- }
- ret = 0;
break;
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
@@ -119,6 +113,13 @@ out:
int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
+ int ret;
+
+ if (type == ACL_TYPE_ACCESS && acl) {
+ ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ if (ret)
+ return ret;
+ }
return __btrfs_set_acl(NULL, inode, acl, type);
}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7699e16784d3..f723c11bb763 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,7 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
@@ -26,6 +26,11 @@
#include "delayed-ref.h"
#include "locking.h"
+enum merge_mode {
+ MERGE_IDENTICAL_KEYS = 1,
+ MERGE_IDENTICAL_PARENTS,
+};
+
/* Just an arbitrary number so we can be sure this happened */
#define BACKREF_FOUND_SHARED 6
@@ -533,7 +538,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
* slot==nritems. In that case, go to the next leaf before we continue.
*/
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- if (time_seq == (u64)-1)
+ if (time_seq == SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -577,7 +582,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- if (time_seq == (u64)-1)
+ if (time_seq == SEQ_LAST)
ret = btrfs_next_item(root, path);
else
ret = btrfs_next_old_item(root, path, time_seq);
@@ -629,7 +634,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
- else if (time_seq == (u64)-1)
+ else if (time_seq == SEQ_LAST)
root_level = btrfs_header_level(root->node);
else
root_level = btrfs_old_root_level(root, time_seq);
@@ -640,7 +645,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
}
path->lowest_level = level;
- if (time_seq == (u64)-1)
+ if (time_seq == SEQ_LAST)
ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
0, 0);
else
@@ -809,14 +814,12 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
/*
* merge backrefs and adjust counts accordingly
*
- * mode = 1: merge identical keys, if key is set
- * FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
- * additionally, we could even add a key range for the blocks we
- * looked into to merge even more (-> replace unresolved refs by those
- * having a parent).
- * mode = 2: merge identical parents
+ * FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in __add_prelim_ref
+ * then we can merge more here. Additionally, we could even add a key
+ * range for the blocks we looked into to merge even more (-> replace
+ * unresolved refs by those having a parent).
*/
-static void __merge_refs(struct list_head *head, int mode)
+static void __merge_refs(struct list_head *head, enum merge_mode mode)
{
struct __prelim_ref *pos1;
@@ -829,7 +832,7 @@ static void __merge_refs(struct list_head *head, int mode)
if (!ref_for_same_block(ref1, ref2))
continue;
- if (mode == 1) {
+ if (mode == MERGE_IDENTICAL_KEYS) {
if (!ref1->parent && ref2->parent)
swap(ref1, ref2);
} else {
@@ -1196,7 +1199,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
*
* NOTE: This can return values > 0
*
- * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave
+ * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave
* much like trans == NULL case, the difference only lies in it will not
* commit root.
* The special case is for qgroup to search roots in commit_transaction().
@@ -1243,7 +1246,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path->skip_locking = 1;
}
- if (time_seq == (u64)-1)
+ if (time_seq == SEQ_LAST)
path->skip_locking = 1;
/*
@@ -1273,9 +1276,9 @@ again:
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
- time_seq != (u64)-1) {
+ time_seq != SEQ_LAST) {
#else
- if (trans && time_seq != (u64)-1) {
+ if (trans && time_seq != SEQ_LAST) {
#endif
/*
* look if there are updates for this ref queued and lock the
@@ -1286,7 +1289,7 @@ again:
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -1374,7 +1377,7 @@ again:
if (ret)
goto out;
- __merge_refs(&prefs, 1);
+ __merge_refs(&prefs, MERGE_IDENTICAL_KEYS);
ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
extent_item_pos, total_refs,
@@ -1382,7 +1385,7 @@ again:
if (ret)
goto out;
- __merge_refs(&prefs, 2);
+ __merge_refs(&prefs, MERGE_IDENTICAL_PARENTS);
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
@@ -2302,7 +2305,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
size_t alloc_bytes;
alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
- data = vmalloc(alloc_bytes);
+ data = kvmalloc(alloc_bytes, GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
@@ -2336,9 +2339,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
if (IS_ERR(fspath))
return (void *)fspath;
- ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
+ ifp = kmalloc(sizeof(*ifp), GFP_KERNEL);
if (!ifp) {
- vfree(fspath);
+ kvfree(fspath);
return ERR_PTR(-ENOMEM);
}
@@ -2353,6 +2356,6 @@ void free_ipath(struct inode_fs_paths *ipath)
{
if (!ipath)
return;
- vfree(ipath->fspath);
+ kvfree(ipath->fspath);
kfree(ipath);
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0c6baaba0651..d87ac27a5f2b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -125,6 +125,13 @@ struct btrfs_inode {
u64 delalloc_bytes;
/*
+ * Total number of bytes pending delalloc that fall within a file
+ * range that is either a hole or beyond EOF (and no prealloc extent
+ * exists in the range). This is always <= delalloc_bytes.
+ */
+ u64 new_delalloc_bytes;
+
+ /*
* total number of bytes pending defrag, used by stat to check whether
* it needs COW.
*/
@@ -303,7 +310,8 @@ struct btrfs_dio_private {
* The original bio may be split to several sub-bios, this is
* done during endio of sub-bios
*/
- int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
+ blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *,
+ blk_status_t);
};
/*
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ab14c2e635ca..11d37c94ce05 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -94,7 +94,7 @@
#include <linux/mutex.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <linux/string.h>
#include "ctree.h"
#include "disk-io.h"
@@ -1638,12 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
struct bio *bio;
unsigned int j;
- bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
- if (!bio) {
- pr_info("btrfsic: bio_alloc() for %u pages failed!\n",
- num_pages - i);
- return -1;
- }
+ bio = btrfs_io_bio_alloc(num_pages - i);
bio->bi_bdev = block_ctx->dev->bdev;
bio->bi_iter.bi_sector = dev_bytenr >> 9;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
@@ -1668,14 +1663,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
dev_bytenr += (j - i) * PAGE_SIZE;
i = j;
}
- for (i = 0; i < num_pages; i++) {
+ for (i = 0; i < num_pages; i++)
block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
- if (!block_ctx->datav[i]) {
- pr_info("btrfsic: kmap() failed (dev %s)!\n",
- block_ctx->dev->name);
- return -1;
- }
- }
return block_ctx->len;
}
@@ -2129,7 +2118,7 @@ static void btrfsic_bio_end_io(struct bio *bp)
/* mutex is not held! This is not save if IO is not yet completed
* on umount */
iodone_w_error = 0;
- if (bp->bi_error)
+ if (bp->bi_status)
iodone_w_error = 1;
BUG_ON(NULL == block);
@@ -2143,7 +2132,7 @@ static void btrfsic_bio_end_io(struct bio *bp)
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
- bp->bi_error,
+ bp->bi_status,
btrfsic_get_block_type(dev_state->state, block),
block->logical_bytenr, dev_state->name,
block->dev_bytenr, block->mirror_num);
@@ -2822,44 +2811,47 @@ static void __btrfsic_submit_bio(struct bio *bio)
dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
if (NULL != dev_state &&
(bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
- unsigned int i;
+ unsigned int i = 0;
u64 dev_bytenr;
u64 cur_bytenr;
- struct bio_vec *bvec;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
int bio_is_patched;
char **mapped_datav;
+ unsigned int segs = bio_segments(bio);
dev_bytenr = 512 * bio->bi_iter.bi_sector;
bio_is_patched = 0;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_vcnt,
+ bio_op(bio), bio->bi_opf, segs,
(unsigned long long)bio->bi_iter.bi_sector,
dev_bytenr, bio->bi_bdev);
- mapped_datav = kmalloc_array(bio->bi_vcnt,
+ mapped_datav = kmalloc_array(segs,
sizeof(*mapped_datav), GFP_NOFS);
if (!mapped_datav)
goto leave;
cur_bytenr = dev_bytenr;
- bio_for_each_segment_all(bvec, bio, i) {
- BUG_ON(bvec->bv_len != PAGE_SIZE);
- mapped_datav[i] = kmap(bvec->bv_page);
+ bio_for_each_segment(bvec, bio, iter) {
+ BUG_ON(bvec.bv_len != PAGE_SIZE);
+ mapped_datav[i] = kmap(bvec.bv_page);
+ i++;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bvec->bv_len, bvec->bv_offset);
- cur_bytenr += bvec->bv_len;
+ i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
+ cur_bytenr += bvec.bv_len;
}
btrfsic_process_written_block(dev_state, dev_bytenr,
- mapped_datav, bio->bi_vcnt,
+ mapped_datav, segs,
bio, &bio_is_patched,
NULL, bio->bi_opf);
- bio_for_each_segment_all(bvec, bio, i)
- kunmap(bvec->bv_page);
+ bio_for_each_segment(bvec, bio, iter)
+ kunmap(bvec.bv_page);
kfree(mapped_datav);
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
@@ -2923,13 +2915,10 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
fs_info->sectorsize, PAGE_SIZE);
return -1;
}
- state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ state = kvzalloc(sizeof(*state), GFP_KERNEL);
if (!state) {
- state = vzalloc(sizeof(*state));
- if (!state) {
- pr_info("btrfs check-integrity: vzalloc() failed!\n");
- return -1;
- }
+ pr_info("btrfs check-integrity: allocation failed!\n");
+ return -1;
}
if (!btrfsic_is_initialized) {
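
The iterator switch in the __btrfsic_submit_bio() hunk above is what lets the bi_vcnt use drop out: bio_for_each_segment() walks the unprocessed segments through a bvec_iter and yields struct bio_vec by value, which stays correct for cloned or split bios, whereas bio_for_each_segment_all() indexes bi_io_vec directly and is only valid on a bio that owns its vector. A sketch of the by-value form (sample_count_bytes is illustrative):

	#include <linux/bio.h>

	static unsigned int sample_count_bytes(struct bio *bio)
	{
		struct bio_vec bvec;	/* a copy, not a pointer into bi_io_vec */
		struct bvec_iter iter;
		unsigned int bytes = 0;

		bio_for_each_segment(bvec, bio, iter)
			bytes += bvec.bv_len;
		return bytes;
	}
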
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c7721a6aa3bb..2c0b7b57fcd5 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,6 +32,7 @@
#include <linux/writeback.h>
#include <linux/bit_spinlock.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -42,48 +43,7 @@
#include "extent_io.h"
#include "extent_map.h"
-struct compressed_bio {
- /* number of bios pending for this compressed extent */
- atomic_t pending_bios;
-
- /* the pages with the compressed data on them */
- struct page **compressed_pages;
-
- /* inode that owns this data */
- struct inode *inode;
-
- /* starting offset in the inode for our pages */
- u64 start;
-
- /* number of bytes in the inode we're working on */
- unsigned long len;
-
- /* number of bytes on disk */
- unsigned long compressed_len;
-
- /* the compression algorithm for this bio */
- int compress_type;
-
- /* number of compressed pages in the array */
- unsigned long nr_pages;
-
- /* IO errors */
- int errors;
- int mirror_num;
-
- /* for reads, this is the bio we are copying the data into */
- struct bio *orig_bio;
-
- /*
- * the start of a variable length array of checksums only
- * used by reads
- */
- u32 sums;
-};
-
-static int btrfs_decompress_bio(int type, struct page **pages_in,
- u64 disk_start, struct bio *orig_bio,
- size_t srclen);
+static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
unsigned long disk_size)
@@ -94,12 +54,6 @@ static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size;
}
-static struct bio *compressed_bio_alloc(struct block_device *bdev,
- u64 first_byte, gfp_t gfp_flags)
-{
- return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags);
-}
-
static int check_compressed_csum(struct btrfs_inode *inode,
struct compressed_bio *cb,
u64 disk_start)
@@ -155,13 +109,13 @@ static void end_compressed_bio_read(struct bio *bio)
unsigned long index;
int ret;
- if (bio->bi_error)
+ if (bio->bi_status)
cb->errors = 1;
/* if there are more bios still pending for this compressed
* extent, just exit
*/
- if (!atomic_dec_and_test(&cb->pending_bios))
+ if (!refcount_dec_and_test(&cb->pending_bios))
goto out;
inode = cb->inode;
@@ -173,11 +127,8 @@ static void end_compressed_bio_read(struct bio *bio)
	/* ok, we're the last bio for this extent, let's start
* the decompression.
*/
- ret = btrfs_decompress_bio(cb->compress_type,
- cb->compressed_pages,
- cb->start,
- cb->orig_bio,
- cb->compressed_len);
+ ret = btrfs_decompress_bio(cb);
+
csum_failed:
if (ret)
cb->errors = 1;
@@ -268,13 +219,13 @@ static void end_compressed_bio_write(struct bio *bio)
struct page *page;
unsigned long index;
- if (bio->bi_error)
+ if (bio->bi_status)
cb->errors = 1;
/* if there are more bios still pending for this compressed
* extent, just exit
*/
- if (!atomic_dec_and_test(&cb->pending_bios))
+ if (!refcount_dec_and_test(&cb->pending_bios))
goto out;
/* ok, we're the last bio for this extent, step one is to
@@ -287,7 +238,7 @@ static void end_compressed_bio_write(struct bio *bio)
cb->start,
cb->start + cb->len - 1,
NULL,
- bio->bi_error ? 0 : 1);
+ bio->bi_status ? 0 : 1);
cb->compressed_pages[0]->mapping = NULL;
end_compressed_writeback(inode, cb);
@@ -320,7 +271,7 @@ out:
* This also checksums the file bytes and gets things ready for
* the end io hooks.
*/
-int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
@@ -335,14 +286,14 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
struct page *page;
u64 first_byte = disk_start;
struct block_device *bdev;
- int ret;
+ blk_status_t ret;
int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
WARN_ON(start & ((u64)PAGE_SIZE - 1));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
- return -ENOMEM;
- atomic_set(&cb->pending_bios, 0);
+ return BLK_STS_RESOURCE;
+ refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
cb->start = start;
@@ -355,30 +306,26 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bdev = fs_info->fs_devices->latest_bdev;
- bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
- if (!bio) {
- kfree(cb);
- return -ENOMEM;
- }
+ bio = btrfs_bio_alloc(bdev, first_byte);
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
- atomic_inc(&cb->pending_bios);
+ refcount_set(&cb->pending_bios, 1);
/* create and submit bios for the compressed pages */
bytes_left = compressed_len;
for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
+ int submit = 0;
+
page = compressed_pages[pg_index];
page->mapping = inode->i_mapping;
if (bio->bi_iter.bi_size)
- ret = io_tree->ops->merge_bio_hook(page, 0,
+ submit = io_tree->ops->merge_bio_hook(page, 0,
PAGE_SIZE,
bio, 0);
- else
- ret = 0;
page->mapping = NULL;
- if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) <
+ if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
PAGE_SIZE) {
bio_get(bio);
@@ -388,7 +335,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
* we inc the count. Otherwise, the cb might get
* freed before we're done setting it up
*/
- atomic_inc(&cb->pending_bios);
+ refcount_inc(&cb->pending_bios);
ret = btrfs_bio_wq_end_io(fs_info, bio,
BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
@@ -400,14 +347,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
ret = btrfs_map_bio(fs_info, bio, 0, 1);
if (ret) {
- bio->bi_error = ret;
+ bio->bi_status = ret;
bio_endio(bio);
}
bio_put(bio);
- bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
- BUG_ON(!bio);
+ bio = btrfs_bio_alloc(bdev, first_byte);
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
@@ -434,7 +380,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
ret = btrfs_map_bio(fs_info, bio, 0, 1);
if (ret) {
- bio->bi_error = ret;
+ bio->bi_status = ret;
bio_endio(bio);
}
@@ -569,7 +515,7 @@ next:
* After the compressed pages are read, we copy the bytes into the
* bio we were passed and then call the bio end_io calls
*/
-int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -586,7 +532,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_len;
u64 em_start;
struct extent_map *em;
- int ret = -ENOMEM;
+ blk_status_t ret = BLK_STS_RESOURCE;
int faili = 0;
u32 *sums;
@@ -600,14 +546,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
PAGE_SIZE);
read_unlock(&em_tree->lock);
if (!em)
- return -EIO;
+ return BLK_STS_IOERR;
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
goto out;
- atomic_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
cb->mirror_num = mirror_num;
@@ -638,7 +584,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
__GFP_HIGHMEM);
if (!cb->compressed_pages[pg_index]) {
faili = pg_index - 1;
- ret = -ENOMEM;
+ ret = BLK_STS_RESOURCE;
goto fail2;
}
}
@@ -650,28 +596,26 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
	/* include any pages we added in add_ra_bio_pages */
cb->len = bio->bi_iter.bi_size;
- comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
- if (!comp_bio)
- goto fail2;
+ comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
	bio_set_op_attrs(comp_bio, REQ_OP_READ, 0);
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
- atomic_inc(&cb->pending_bios);
+ refcount_set(&cb->pending_bios, 1);
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+ int submit = 0;
+
page = cb->compressed_pages[pg_index];
page->mapping = inode->i_mapping;
page->index = em_start >> PAGE_SHIFT;
if (comp_bio->bi_iter.bi_size)
- ret = tree->ops->merge_bio_hook(page, 0,
+ submit = tree->ops->merge_bio_hook(page, 0,
PAGE_SIZE,
comp_bio, 0);
- else
- ret = 0;
page->mapping = NULL;
- if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
+ if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
PAGE_SIZE) {
bio_get(comp_bio);
@@ -685,7 +629,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
* we inc the count. Otherwise, the cb might get
* freed before we're done setting it up
*/
- atomic_inc(&cb->pending_bios);
+ refcount_inc(&cb->pending_bios);
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(inode, comp_bio,
@@ -697,15 +641,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
if (ret) {
- comp_bio->bi_error = ret;
+ comp_bio->bi_status = ret;
bio_endio(comp_bio);
}
bio_put(comp_bio);
- comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
- GFP_NOFS);
- BUG_ON(!comp_bio);
+ comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
bio_set_op_attrs(comp_bio, REQ_OP_READ, 0);
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
@@ -726,7 +668,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
if (ret) {
- comp_bio->bi_error = ret;
+ comp_bio->bi_status = ret;
bio_endio(comp_bio);
}
@@ -801,6 +743,7 @@ static struct list_head *find_workspace(int type)
struct list_head *workspace;
int cpus = num_online_cpus();
int idx = type - 1;
+ unsigned nofs_flag;
struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
@@ -830,7 +773,15 @@ again:
atomic_inc(total_ws);
spin_unlock(ws_lock);
+ /*
+	 * Allocation helpers call vmalloc, which cannot take GFP_NOFS as a
+	 * flag, so use the scoped NOFS API to turn fs reclaim off here: we
+	 * might get called from the restricted context of
+	 * btrfs_compress_bio/btrfs_compress_pages
+ */
+ nofs_flag = memalloc_nofs_save();
workspace = btrfs_compress_op[idx]->alloc_workspace();
+ memalloc_nofs_restore(nofs_flag);
+
if (IS_ERR(workspace)) {
atomic_dec(total_ws);
wake_up(ws_wait);
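/*
 * Minimal sketch of the scoped-NOFS pattern used above (linux/sched/mm.h),
 * with an assumed allocation callback: everything between save and restore
 * behaves as if allocated with GFP_NOFS, which makes vmalloc-based helpers
 * safe to call from fs-reclaim-sensitive context.
 */
static struct list_head *alloc_workspace_nofs(struct list_head *(*alloc_ws)(void))
{
	struct list_head *ws;
	unsigned int nofs_flag;

	nofs_flag = memalloc_nofs_save();	/* enter implicit NOFS scope */
	ws = alloc_ws();			/* may use vmalloc(GFP_KERNEL) */
	memalloc_nofs_restore(nofs_flag);	/* leave the scope */
	return ws;
}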
@@ -961,19 +912,16 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
* be contiguous. They all correspond to the range of bytes covered by
* the compressed extent.
*/
-static int btrfs_decompress_bio(int type, struct page **pages_in,
- u64 disk_start, struct bio *orig_bio,
- size_t srclen)
+static int btrfs_decompress_bio(struct compressed_bio *cb)
{
struct list_head *workspace;
int ret;
+ int type = cb->compress_type;
workspace = find_workspace(type);
-
- ret = btrfs_compress_op[type-1]->decompress_bio(workspace, pages_in,
- disk_start, orig_bio,
- srclen);
+ ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
free_workspace(type, workspace);
+
return ret;
}
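/*
 * Sketch of the refcount_t discipline the conversion above enforces:
 * refcount_inc() on a zero count warns and is considered a bug, so the
 * submission paths now take their first reference with
 * refcount_set(&cb->pending_bios, 1) and only additional split bios use
 * refcount_inc().  The endio side mirrors it.
 */
static void compressed_bio_endio_sketch(struct compressed_bio *cb)
{
	if (!refcount_dec_and_test(&cb->pending_bios))
		return;			/* other bios still in flight */
	/* last reference dropped: finish the extent and free cb */
}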
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 39ec43ab8df1..87f6d3332163 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -34,6 +34,45 @@
/* Maximum size of data before compression */
#define BTRFS_MAX_UNCOMPRESSED (SZ_128K)
+struct compressed_bio {
+ /* number of bios pending for this compressed extent */
+ refcount_t pending_bios;
+
+ /* the pages with the compressed data on them */
+ struct page **compressed_pages;
+
+ /* inode that owns this data */
+ struct inode *inode;
+
+ /* starting offset in the inode for our pages */
+ u64 start;
+
+ /* number of bytes in the inode we're working on */
+ unsigned long len;
+
+ /* number of bytes on disk */
+ unsigned long compressed_len;
+
+ /* the compression algorithm for this bio */
+ int compress_type;
+
+ /* number of compressed pages in the array */
+ unsigned long nr_pages;
+
+ /* IO errors */
+ int errors;
+ int mirror_num;
+
+ /* for reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+
+ /*
+ * the start of a variable length array of checksums only
+ * used by reads
+ */
+ u32 sums;
+};
+
void btrfs_init_compress(void);
void btrfs_exit_compress(void);
@@ -48,12 +87,12 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
struct bio *bio);
-int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
unsigned long nr_pages);
-int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
enum btrfs_compression_type {
@@ -78,10 +117,7 @@ struct btrfs_compress_op {
unsigned long *total_out);
int (*decompress_bio)(struct list_head *workspace,
- struct page **pages_in,
- u64 disk_start,
- struct bio *orig_bio,
- size_t srclen);
+ struct compressed_bio *cb);
int (*decompress)(struct list_head *workspace,
unsigned char *data_in,
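/*
 * Hedged sketch of how a compression backend adapts to the slimmed-down
 * ->decompress_bio(workspace, cb) signature declared above: the values
 * that used to arrive as four separate arguments are now read from the
 * compressed_bio itself.
 */
static int sample_decompress_bio(struct list_head *workspace,
				 struct compressed_bio *cb)
{
	struct page **pages_in = cb->compressed_pages;	/* compressed input */
	u64 disk_start = cb->start;		/* file offset of the extent */
	struct bio *orig_bio = cb->orig_bio;	/* pages to decompress into */
	size_t srclen = cb->compressed_len;	/* bytes of compressed data */

	/* a real backend runs its inflate loop over the values above */
	return 0;
}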
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7dc8844037e0..3f4daa9d6e2c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -19,7 +19,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -567,7 +567,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int dst_slot, int src_slot,
- int nr_items, gfp_t flags)
+ int nr_items)
{
struct tree_mod_elem *tm = NULL;
struct tree_mod_elem **tm_list = NULL;
@@ -578,11 +578,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
if (!tree_mod_need_log(fs_info, eb))
return 0;
- tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
+ tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
if (!tm_list)
return -ENOMEM;
- tm = kzalloc(sizeof(*tm), flags);
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm) {
ret = -ENOMEM;
goto free_tms;
@@ -596,7 +596,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
- MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -663,7 +663,7 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
struct extent_buffer *old_root,
- struct extent_buffer *new_root, gfp_t flags,
+ struct extent_buffer *new_root,
int log_removal)
{
struct tree_mod_elem *tm = NULL;
@@ -678,14 +678,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
if (log_removal && btrfs_header_level(old_root) > 0) {
nritems = btrfs_header_nritems(old_root);
tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
- flags);
+ GFP_NOFS);
if (!tm_list) {
ret = -ENOMEM;
goto free_tms;
}
for (i = 0; i < nritems; i++) {
tm_list[i] = alloc_tree_mod_elem(old_root, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -693,7 +693,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
}
}
- tm = kzalloc(sizeof(*tm), flags);
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm) {
ret = -ENOMEM;
goto free_tms;
@@ -873,7 +873,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
{
int ret;
ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
- nr_items, GFP_NOFS);
+ nr_items);
BUG_ON(ret < 0);
}
@@ -943,7 +943,7 @@ tree_mod_log_set_root_pointer(struct btrfs_root *root,
{
int ret;
ret = tree_mod_log_insert_root(root->fs_info, root->node,
- new_root_node, GFP_NOFS, log_removal);
+ new_root_node, log_removal);
BUG_ON(ret < 0);
}
@@ -3667,14 +3667,14 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
/* make room in the right data area */
data_end = leaf_data_end(fs_info, right);
memmove_extent_buffer(right,
- btrfs_leaf_data(right) + data_end - push_space,
- btrfs_leaf_data(right) + data_end,
+ BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
/* copy from the left data area */
- copy_extent_buffer(right, left, btrfs_leaf_data(right) +
+ copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
- btrfs_leaf_data(left) + leaf_data_end(fs_info, left),
+ BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
push_space);
memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
@@ -3888,9 +3888,9 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
btrfs_item_offset_nr(right, push_items - 1);
- copy_extent_buffer(left, right, btrfs_leaf_data(left) +
+ copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(fs_info, left) - push_space,
- btrfs_leaf_data(right) +
+ BTRFS_LEAF_DATA_OFFSET +
btrfs_item_offset_nr(right, push_items - 1),
push_space);
old_left_nritems = btrfs_header_nritems(left);
@@ -3917,9 +3917,9 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
if (push_items < right_nritems) {
push_space = btrfs_item_offset_nr(right, push_items - 1) -
leaf_data_end(fs_info, right);
- memmove_extent_buffer(right, btrfs_leaf_data(right) +
+ memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
- btrfs_leaf_data(right) +
+ BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(fs_info, right), push_space);
memmove_extent_buffer(right, btrfs_item_nr_offset(0),
@@ -4069,8 +4069,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
nritems * sizeof(struct btrfs_item));
copy_extent_buffer(right, l,
- btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(fs_info) -
- data_copy_size, btrfs_leaf_data(l) +
+ BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
+ data_copy_size, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(fs_info, l), data_copy_size);
rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
@@ -4607,8 +4607,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
/* shift the data */
if (from_end) {
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
- data_end + size_diff, btrfs_leaf_data(leaf) +
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data_start + new_size - data_end);
} else {
struct btrfs_disk_key disk_key;
@@ -4634,8 +4634,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
}
}
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
- data_end + size_diff, btrfs_leaf_data(leaf) +
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data_start - data_end);
offset = btrfs_disk_key_offset(&disk_key);
@@ -4707,8 +4707,8 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
}
/* shift the data */
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
- data_end - data_size, btrfs_leaf_data(leaf) +
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data - data_end);
data_end = old_data;
@@ -4790,8 +4790,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
(nritems - slot) * sizeof(struct btrfs_item));
/* shift the data */
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
- data_end - total_data, btrfs_leaf_data(leaf) +
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data - data_end);
data_end = old_data;
}
@@ -4983,9 +4983,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (slot + nr != nritems) {
int data_end = leaf_data_end(fs_info, leaf);
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + dsize,
- btrfs_leaf_data(leaf) + data_end,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
last_off - data_end);
for (i = slot + nr; i < nritems; i++) {
@@ -5392,13 +5392,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- tmp_buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN);
+ tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
if (!tmp_buf) {
- tmp_buf = vmalloc(fs_info->nodesize);
- if (!tmp_buf) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
left_path->search_commit_root = 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3e21211e99c3..3f3eb7b17cac 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -39,6 +39,7 @@
#include <linux/security.h>
#include <linux/sizes.h>
#include <linux/dynamic_debug.h>
+#include <linux/refcount.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -47,7 +48,6 @@ struct btrfs_trans_handle;
struct btrfs_transaction;
struct btrfs_pending_snapshot;
extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
@@ -518,7 +518,7 @@ struct btrfs_caching_control {
struct btrfs_work work;
struct btrfs_block_group_cache *block_group;
u64 progress;
- atomic_t count;
+ refcount_t count;
};
/* Once caching_thread() finds this much free space, it will wake up waiters. */
@@ -538,6 +538,14 @@ struct btrfs_io_ctl {
unsigned check_crcs:1;
};
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+ struct rb_root root;
+ struct mutex lock;
+};
+
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
@@ -648,6 +656,9 @@ struct btrfs_block_group_cache {
* Protected by free_space_lock.
*/
int needs_free_space;
+
+ /* Record locked full stripes for RAID5/6 block group */
+ struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
};
/* delayed seq elem */
@@ -658,6 +669,8 @@ struct seq_list {
#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+#define SEQ_LAST ((u64)-1)
+
enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_STARTED = 1,
ORPHAN_CLEANUP_DONE = 2,
@@ -702,6 +715,15 @@ struct btrfs_delayed_root;
#define BTRFS_FS_BTREE_ERR 11
#define BTRFS_FS_LOG1_ERR 12
#define BTRFS_FS_LOG2_ERR 13
+#define BTRFS_FS_QUOTA_OVERRIDE 14
+/* Used to record internally whether fs has been frozen */
+#define BTRFS_FS_FROZEN 15
+
+/*
+ * Indicate that a whole-filesystem exclusive operation is running
+ * (device replace, resize, device add/delete, balance)
+ */
+#define BTRFS_FS_EXCL_OP 16
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
@@ -729,8 +751,7 @@ struct btrfs_fs_info {
struct rb_root block_group_cache_tree;
/* keep track of unallocated space */
- spinlock_t free_chunk_lock;
- u64 free_chunk_space;
+ atomic64_t free_chunk_space;
struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;
@@ -778,17 +799,7 @@ struct btrfs_fs_info {
* so it is also safe.
*/
u64 max_inline;
- /*
- * Protected by ->chunk_mutex and sb->s_umount.
- *
- * The reason that we use two lock to protect it is because only
- * remount and mount operations can change it and these two operations
- * are under sb->s_umount, but the read side (chunk allocation) can not
- * acquire sb->s_umount or the deadlock would happen. So we use two
- * locks to protect it. On the write side, we must acquire two locks,
- * and on the read side, we just need acquire one of them.
- */
- u64 alloc_start;
+
struct btrfs_transaction *running_transaction;
wait_queue_head_t transaction_throttle;
wait_queue_head_t transaction_wait;
@@ -1066,8 +1077,6 @@ struct btrfs_fs_info {
/* device replace state */
struct btrfs_dev_replace dev_replace;
- atomic_t mutually_exclusive_operation_running;
-
struct percpu_counter bio_counter;
wait_queue_head_t replace_wait;
@@ -1090,9 +1099,6 @@ struct btrfs_fs_info {
*/
struct list_head pinned_chunks;
- /* Used to record internally whether fs has been frozen */
- int fs_frozen;
-
/* Cached block sizes */
u32 nodesize;
u32 sectorsize;
@@ -1220,7 +1226,7 @@ struct btrfs_root {
dev_t anon_dev;
spinlock_t root_item_lock;
- atomic_t refs;
+ refcount_t refs;
struct mutex delalloc_mutex;
spinlock_t delalloc_lock;
@@ -1260,21 +1266,20 @@ struct btrfs_root {
/* For qgroup metadata space reserve */
atomic64_t qgroup_meta_rsv;
};
+
static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
{
return btrfs_sb(inode->i_sb)->sectorsize;
}
-static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize)
-{
- return blocksize - sizeof(struct btrfs_header);
-}
-
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
- return __BTRFS_LEAF_DATA_SIZE(info->nodesize);
+	return info->nodesize - sizeof(struct btrfs_header);
}
+#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items)
+
static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info)
{
return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item);
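/*
 * Sketch of the leaf addressing that BTRFS_LEAF_DATA_OFFSET makes
 * explicit: item headers grow from the front of the leaf, item data grows
 * from the back, and the stored data offsets are relative to the start of
 * the items area.
 */
static unsigned long item_data_offset(struct extent_buffer *leaf, int slot)
{
	/* same arithmetic as the reworked btrfs_item_ptr_offset() macro */
	return BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset_nr(leaf, slot);
}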
@@ -1536,8 +1541,27 @@ static inline void btrfs_set_##name(type *s, u##bits val) \
s->member = cpu_to_le##bits(val); \
}
+
+static inline u64 btrfs_device_total_bytes(struct extent_buffer *eb,
+ struct btrfs_dev_item *s)
+{
+	BUILD_BUG_ON(sizeof(u64) !=
+		     sizeof(((struct btrfs_dev_item *)0)->total_bytes));
+ return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
+ total_bytes));
+}
+static inline void btrfs_set_device_total_bytes(struct extent_buffer *eb,
+ struct btrfs_dev_item *s,
+ u64 val)
+{
+	BUILD_BUG_ON(sizeof(u64) !=
+		     sizeof(((struct btrfs_dev_item *)0)->total_bytes));
+ WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
+ btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
+}
+
BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
-BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
@@ -2307,10 +2331,6 @@ static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
return btrfs_csum_sizes[t];
}
-static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
-{
- return offsetof(struct btrfs_leaf, items);
-}
/*
* The leaf data grows from end-to-front in the node.
@@ -2521,11 +2541,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
/* helper function to cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
- ((type *)(btrfs_leaf_data(leaf) + \
+ ((type *)(BTRFS_LEAF_DATA_OFFSET + \
btrfs_item_offset_nr(leaf, slot)))
#define btrfs_item_ptr_offset(leaf, slot) \
- ((unsigned long)(btrfs_leaf_data(leaf) + \
+ ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
btrfs_item_offset_nr(leaf, slot)))
static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
@@ -2546,7 +2566,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes);
static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info,
unsigned num_items)
{
- return fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}
/*
@@ -2556,7 +2576,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info,
static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
unsigned num_items)
{
- return fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
}
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
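/*
 * Worked example of why the (u64) casts above matter: nodesize is u32 and
 * num_items is unsigned, so the old expression multiplied entirely in
 * 32-bit arithmetic and could wrap before the result was widened.
 */
static void metadata_size_overflow_demo(void)
{
	/* 64K nodesize, BTRFS_MAX_LEVEL == 8, 4096 items */
	u32 wrapped = 65536u * 8 * 2 * 4096;		/* 2^32 wraps to 0 */
	u64 correct = (u64)65536u * 8 * 2 * 4096;	/* 4294967296 */
}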
@@ -2663,7 +2683,9 @@ void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
+u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info);
+u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info);
+u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
enum btrfs_reserve_flush_enum {
@@ -2686,9 +2708,13 @@ enum btrfs_flush_state {
COMMIT_TRANS = 6,
};
-int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
-void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+int btrfs_check_data_free_space(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len);
+void btrfs_free_reserved_data_space(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2705,8 +2731,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
-void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
+int btrfs_delalloc_reserve_space(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len);
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type);
@@ -3014,12 +3040,14 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
const char *name, u16 name_len,
int mod);
int verify_dir_item(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf,
+ struct extent_buffer *leaf, int slot,
struct btrfs_dir_item *dir_item);
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const char *name,
int name_len);
+bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot,
+ unsigned long start, u16 name_len);
/* orphan.c */
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3061,8 +3089,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr, u64 len);
-int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
-int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
+blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
u64 logical_offset);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -3077,7 +3105,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
-int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
u64 file_start, int contig);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
@@ -3154,6 +3182,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags);
+void btrfs_set_range_writeback(void *private_data, u64 start, u64 end);
int btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
@@ -3646,6 +3675,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
struct btrfs_device *dev);
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress);
+static inline void btrfs_init_full_stripe_locks_tree(
+ struct btrfs_full_stripe_locks_tree *locks_root)
+{
+ locks_root->root = RB_ROOT;
+ mutex_init(&locks_root->lock);
+}
/* dev-replace.c */
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
@@ -3670,8 +3705,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
struct btrfs_key *start, struct btrfs_key *end);
int btrfs_reada_wait(void *handle);
void btrfs_reada_detach(void *handle);
-int btree_readahead_hook(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int err);
+int btree_readahead_hook(struct extent_buffer *eb, int err);
static inline int is_fstree(u64 rootid)
{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 1aff676f0e5b..8ae409b5a61d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -52,7 +52,7 @@ static inline void btrfs_init_delayed_node(
{
delayed_node->root = root;
delayed_node->inode_id = inode_id;
- atomic_set(&delayed_node->refs, 0);
+ refcount_set(&delayed_node->refs, 0);
delayed_node->ins_root = RB_ROOT;
delayed_node->del_root = RB_ROOT;
mutex_init(&delayed_node->mutex);
@@ -81,7 +81,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = READ_ONCE(btrfs_inode->delayed_node);
if (node) {
- atomic_inc(&node->refs);
+ refcount_inc(&node->refs);
return node;
}
@@ -89,14 +89,14 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
if (node) {
if (btrfs_inode->delayed_node) {
- atomic_inc(&node->refs); /* can be accessed */
+ refcount_inc(&node->refs); /* can be accessed */
BUG_ON(btrfs_inode->delayed_node != node);
spin_unlock(&root->inode_lock);
return node;
}
btrfs_inode->delayed_node = node;
/* can be accessed and cached in the inode */
- atomic_add(2, &node->refs);
+ refcount_add(2, &node->refs);
spin_unlock(&root->inode_lock);
return node;
}
@@ -125,7 +125,7 @@ again:
btrfs_init_delayed_node(node, root, ino);
/* cached in the btrfs inode and can be accessed */
- atomic_add(2, &node->refs);
+ refcount_set(&node->refs, 2);
ret = radix_tree_preload(GFP_NOFS);
if (ret) {
@@ -166,7 +166,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
} else {
list_add_tail(&node->n_list, &root->node_list);
list_add_tail(&node->p_list, &root->prepare_list);
- atomic_inc(&node->refs); /* inserted into list */
+ refcount_inc(&node->refs); /* inserted into list */
root->nodes++;
set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
}
@@ -180,7 +180,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
spin_lock(&root->lock);
if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
root->nodes--;
- atomic_dec(&node->refs); /* not in the list */
+ refcount_dec(&node->refs); /* not in the list */
list_del_init(&node->n_list);
if (!list_empty(&node->p_list))
list_del_init(&node->p_list);
@@ -201,7 +201,7 @@ static struct btrfs_delayed_node *btrfs_first_delayed_node(
p = delayed_root->node_list.next;
node = list_entry(p, struct btrfs_delayed_node, n_list);
- atomic_inc(&node->refs);
+ refcount_inc(&node->refs);
out:
spin_unlock(&delayed_root->lock);
@@ -228,7 +228,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
p = node->n_list.next;
next = list_entry(p, struct btrfs_delayed_node, n_list);
- atomic_inc(&next->refs);
+ refcount_inc(&next->refs);
out:
spin_unlock(&delayed_root->lock);
@@ -253,11 +253,11 @@ static void __btrfs_release_delayed_node(
btrfs_dequeue_delayed_node(delayed_root, delayed_node);
mutex_unlock(&delayed_node->mutex);
- if (atomic_dec_and_test(&delayed_node->refs)) {
+ if (refcount_dec_and_test(&delayed_node->refs)) {
bool free = false;
struct btrfs_root *root = delayed_node->root;
spin_lock(&root->inode_lock);
- if (atomic_read(&delayed_node->refs) == 0) {
+ if (refcount_read(&delayed_node->refs) == 0) {
radix_tree_delete(&root->delayed_nodes_tree,
delayed_node->inode_id);
free = true;
@@ -286,7 +286,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
p = delayed_root->prepare_list.next;
list_del_init(p);
node = list_entry(p, struct btrfs_delayed_node, p_list);
- atomic_inc(&node->refs);
+ refcount_inc(&node->refs);
out:
spin_unlock(&delayed_root->lock);
@@ -308,7 +308,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
item->ins_or_del = 0;
item->bytes_reserved = 0;
item->delayed_node = NULL;
- atomic_set(&item->refs, 1);
+ refcount_set(&item->refs, 1);
}
return item;
}
@@ -483,7 +483,7 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
{
if (item) {
__btrfs_remove_delayed_item(item);
- if (atomic_dec_and_test(&item->refs))
+ if (refcount_dec_and_test(&item->refs))
kfree(item);
}
}
@@ -1600,14 +1600,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
while (item) {
- atomic_inc(&item->refs);
+ refcount_inc(&item->refs);
list_add_tail(&item->readdir_list, ins_list);
item = __btrfs_next_delayed_item(item);
}
item = __btrfs_first_delayed_deletion_item(delayed_node);
while (item) {
- atomic_inc(&item->refs);
+ refcount_inc(&item->refs);
list_add_tail(&item->readdir_list, del_list);
item = __btrfs_next_delayed_item(item);
}
@@ -1621,7 +1621,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* insert/delete delayed items in this period. So we also needn't
* requeue or dequeue this delayed node.
*/
- atomic_dec(&delayed_node->refs);
+ refcount_dec(&delayed_node->refs);
return true;
}
@@ -1634,13 +1634,13 @@ void btrfs_readdir_put_delayed_items(struct inode *inode,
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
list_del(&curr->readdir_list);
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
}
list_for_each_entry_safe(curr, next, del_list, readdir_list) {
list_del(&curr->readdir_list);
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
}
@@ -1667,7 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
list_del(&curr->readdir_list);
ret = (curr->key.offset == index);
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
if (ret)
@@ -1705,7 +1705,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
list_del(&curr->readdir_list);
if (curr->key.offset < ctx->pos) {
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
continue;
}
@@ -1722,7 +1722,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
over = !dir_emit(ctx, name, name_len,
location.objectid, d_type);
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
if (over)
@@ -1963,7 +1963,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
inode_id = delayed_nodes[n - 1]->inode_id + 1;
for (i = 0; i < n; i++)
- atomic_inc(&delayed_nodes[i]->refs);
+ refcount_inc(&delayed_nodes[i]->refs);
spin_unlock(&root->inode_lock);
for (i = 0; i < n; i++) {
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 40327cc3b99a..c4189d495934 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -26,7 +26,7 @@
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/atomic.h>
-
+#include <linux/refcount.h>
#include "ctree.h"
/* types of the delayed item */
@@ -67,7 +67,7 @@ struct btrfs_delayed_node {
struct rb_root del_root;
struct mutex mutex;
struct btrfs_inode_item inode_item;
- atomic_t refs;
+ refcount_t refs;
u64 index_cnt;
unsigned long flags;
int count;
@@ -80,7 +80,7 @@ struct btrfs_delayed_item {
struct list_head readdir_list; /* used for readdir items */
u64 bytes_reserved;
struct btrfs_delayed_node *delayed_node;
- atomic_t refs;
+ refcount_t refs;
int ins_or_del;
u32 data_len;
char data[0];
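/*
 * Sketch of the release path that motivates keeping refcount_read() in
 * __btrfs_release_delayed_node() above: after the count hits zero the node
 * is still reachable through the radix tree, so deletion re-checks under
 * inode_lock before freeing (names here mirror delayed-inode.c).
 */
static void release_delayed_node_sketch(struct btrfs_delayed_node *node)
{
	struct btrfs_root *root = node->root;
	bool free = false;

	if (!refcount_dec_and_test(&node->refs))
		return;
	spin_lock(&root->inode_lock);
	if (refcount_read(&node->refs) == 0) {	/* nothing revived it */
		radix_tree_delete(&root->delayed_nodes_tree, node->inode_id);
		free = true;
	}
	spin_unlock(&root->inode_lock);
	if (free)
		kmem_cache_free(delayed_node_cache, node);
}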
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6eb80952efb3..93ffa898df6d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -164,7 +164,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
if (mutex_trylock(&head->mutex))
return 0;
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
@@ -470,7 +470,8 @@ add_tail:
static noinline void
update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_node *existing,
- struct btrfs_delayed_ref_node *update)
+ struct btrfs_delayed_ref_node *update,
+ int *old_ref_mod_ret)
{
struct btrfs_delayed_ref_head *existing_ref;
struct btrfs_delayed_ref_head *ref;
@@ -523,6 +524,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
* currently, for refs we just added we know we're a-ok.
*/
old_ref_mod = existing_ref->total_ref_mod;
+ if (old_ref_mod_ret)
+ *old_ref_mod_ret = old_ref_mod;
existing->ref_mod += update->ref_mod;
existing_ref->total_ref_mod += update->ref_mod;
@@ -550,7 +553,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *ref,
struct btrfs_qgroup_extent_record *qrecord,
u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
- int action, int is_data, int *qrecord_inserted_ret)
+ int action, int is_data, int *qrecord_inserted_ret,
+ int *old_ref_mod, int *new_ref_mod)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
@@ -590,7 +594,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
/* first set the basic ref node struct up */
- atomic_set(&ref->refs, 1);
+ refcount_set(&ref->refs, 1);
ref->bytenr = bytenr;
ref->num_bytes = num_bytes;
ref->ref_mod = count_mod;
@@ -638,7 +642,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
if (existing) {
WARN_ON(ref_root && reserved && existing->qgroup_ref_root
&& existing->qgroup_reserved);
- update_existing_head_ref(delayed_refs, &existing->node, ref);
+ update_existing_head_ref(delayed_refs, &existing->node, ref,
+ old_ref_mod);
/*
* we've updated the existing ref, free the newly
* allocated ref
@@ -646,6 +651,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
+ if (old_ref_mod)
+ *old_ref_mod = 0;
if (is_data && count_mod < 0)
delayed_refs->pending_csums += num_bytes;
delayed_refs->num_heads++;
@@ -655,6 +662,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
+ if (new_ref_mod)
+ *new_ref_mod = head_ref->total_ref_mod;
return head_ref;
}
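/*
 * Sketch of how a caller is expected to consume the new out-parameters
 * (caller shape assumed, not shown in this hunk): comparing the head's
 * total_ref_mod before and after the update lets it detect when an
 * extent's net reference count crosses zero.
 */
static int ref_mod_crossed_zero(int old_ref_mod, int new_ref_mod)
{
	if (old_ref_mod >= 0 && new_ref_mod < 0)
		return -1;	/* extent became logically freed */
	if (old_ref_mod < 0 && new_ref_mod >= 0)
		return 1;	/* extent became logically live again */
	return 0;		/* no transition */
}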
@@ -682,7 +691,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
/* first set the basic ref node struct up */
- atomic_set(&ref->refs, 1);
+ refcount_set(&ref->refs, 1);
ref->bytenr = bytenr;
ref->num_bytes = num_bytes;
ref->ref_mod = 1;
@@ -739,7 +748,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
seq = atomic64_read(&fs_info->tree_mod_seq);
/* first set the basic ref node struct up */
- atomic_set(&ref->refs, 1);
+ refcount_set(&ref->refs, 1);
ref->bytenr = bytenr;
ref->num_bytes = num_bytes;
ref->ref_mod = 1;
@@ -778,7 +787,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int *old_ref_mod, int *new_ref_mod)
{
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
@@ -813,7 +823,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
*/
head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
bytenr, num_bytes, 0, 0, action, 0,
- &qrecord_inserted);
+ &qrecord_inserted, old_ref_mod,
+ new_ref_mod);
add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, level, action);
@@ -838,7 +849,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
- u64 owner, u64 offset, u64 reserved, int action)
+ u64 owner, u64 offset, u64 reserved, int action,
+ int *old_ref_mod, int *new_ref_mod)
{
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
@@ -878,7 +890,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
*/
head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
bytenr, num_bytes, ref_root, reserved,
- action, 1, &qrecord_inserted);
+ action, 1, &qrecord_inserted,
+ old_ref_mod, new_ref_mod);
add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
@@ -909,7 +922,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
- extent_op->is_data, NULL);
+ extent_op->is_data, NULL, NULL, NULL);
spin_unlock(&delayed_refs->lock);
return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 0e537f98f1a1..ce88e4ac5276 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -18,6 +18,8 @@
#ifndef __DELAYED_REF__
#define __DELAYED_REF__
+#include <linux/refcount.h>
+
/* these are the possible values of struct btrfs_delayed_ref_node->action */
#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
@@ -53,7 +55,7 @@ struct btrfs_delayed_ref_node {
u64 seq;
/* ref count on this data structure */
- atomic_t refs;
+ refcount_t refs;
/*
* how many refs is this entry adding or deleting. For
@@ -220,8 +222,8 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
- WARN_ON(atomic_read(&ref->refs) == 0);
- if (atomic_dec_and_test(&ref->refs)) {
+ WARN_ON(refcount_read(&ref->refs) == 0);
+ if (refcount_dec_and_test(&ref->refs)) {
WARN_ON(ref->in_tree);
switch (ref->type) {
case BTRFS_TREE_BLOCK_REF_KEY:
@@ -245,12 +247,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op);
+ struct btrfs_delayed_extent_op *extent_op,
+ int *old_ref_mod, int *new_ref_mod);
int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
- u64 owner, u64 offset, u64 reserved, int action);
+ u64 owner, u64 offset, u64 reserved, int action,
+ int *old_ref_mod, int *new_ref_mod);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e653921f05d9..bee3edeea7a3 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -388,7 +388,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
- btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
/* force writing the updated state information to disk */
trans = btrfs_start_transaction(root, 0);
@@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
- btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -546,8 +546,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
+ btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ btrfs_rm_dev_replace_unblocked(fs_info);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return scrub_ret;
@@ -665,7 +667,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
srcdev = dev_replace->srcdev;
- args->status.progress_1000 = div_u64(dev_replace->cursor_left,
+ args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
break;
}
@@ -784,8 +786,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
}
btrfs_dev_replace_unlock(dev_replace, 1);
- WARN_ON(atomic_xchg(
- &fs_info->mutually_exclusive_operation_running, 1));
+ WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
return PTR_ERR_OR_ZERO(task);
}
@@ -814,7 +815,7 @@ static int btrfs_dev_replace_kthread(void *data)
(unsigned int)progress);
}
btrfs_dev_replace_continue_on_mount(fs_info);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return 0;
}
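/*
 * Sketch of the exclusive-operation protocol that replaces the old
 * mutually_exclusive_operation_running atomic: balance, resize, device
 * add/remove and replace all claim the same flag bit, so only one of them
 * can run at a time (the error code here is an assumption).
 */
static int claim_exclusive_op(struct btrfs_fs_info *fs_info)
{
	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
		return -EBUSY;	/* another exclusive operation is running */
	return 0;		/* pair with clear_bit() when done */
}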
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 60a750678a82..41cb9196eaa8 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -395,8 +395,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
leaf = path->nodes[0];
dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
- if (verify_dir_item(fs_info, leaf, dir_item))
- return NULL;
total_len = btrfs_item_size_nr(leaf, path->slots[0]);
while (cur < total_len) {
@@ -405,6 +403,8 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
btrfs_dir_data_len(leaf, dir_item);
name_ptr = (unsigned long)(dir_item + 1);
+ if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item))
+ return NULL;
if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
return dir_item;
@@ -453,9 +453,11 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
int verify_dir_item(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
+ int slot,
struct btrfs_dir_item *dir_item)
{
u16 namelen = BTRFS_NAME_LEN;
+ int ret;
u8 type = btrfs_dir_type(leaf, dir_item);
if (type >= BTRFS_FT_MAX) {
@@ -468,10 +470,16 @@ int verify_dir_item(struct btrfs_fs_info *fs_info,
if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
btrfs_crit(fs_info, "invalid dir item name len: %u",
- (unsigned)btrfs_dir_data_len(leaf, dir_item));
+ (unsigned)btrfs_dir_name_len(leaf, dir_item));
return 1;
}
+ namelen = btrfs_dir_name_len(leaf, dir_item);
+ ret = btrfs_is_name_len_valid(leaf, slot,
+ (unsigned long)(dir_item + 1), namelen);
+ if (!ret)
+ return 1;
+
/* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
if ((btrfs_dir_data_len(leaf, dir_item) +
btrfs_dir_name_len(leaf, dir_item)) >
@@ -484,3 +492,67 @@ int verify_dir_item(struct btrfs_fs_info *fs_info,
return 0;
}
+
+bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot,
+ unsigned long start, u16 name_len)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_key key;
+ u32 read_start;
+ u32 read_end;
+ u32 item_start;
+ u32 item_end;
+ u32 size;
+ bool ret = true;
+
+ ASSERT(start > BTRFS_LEAF_DATA_OFFSET);
+
+ read_start = start - BTRFS_LEAF_DATA_OFFSET;
+ read_end = read_start + name_len;
+ item_start = btrfs_item_offset_nr(leaf, slot);
+ item_end = btrfs_item_end_nr(leaf, slot);
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ switch (key.type) {
+ case BTRFS_DIR_ITEM_KEY:
+ case BTRFS_XATTR_ITEM_KEY:
+ case BTRFS_DIR_INDEX_KEY:
+ size = sizeof(struct btrfs_dir_item);
+ break;
+ case BTRFS_INODE_REF_KEY:
+ size = sizeof(struct btrfs_inode_ref);
+ break;
+ case BTRFS_INODE_EXTREF_KEY:
+ size = sizeof(struct btrfs_inode_extref);
+ break;
+ case BTRFS_ROOT_REF_KEY:
+ case BTRFS_ROOT_BACKREF_KEY:
+ size = sizeof(struct btrfs_root_ref);
+ break;
+ default:
+ ret = false;
+ goto out;
+ }
+
+ if (read_start < item_start) {
+ ret = false;
+ goto out;
+ }
+ if (read_end > item_end) {
+ ret = false;
+ goto out;
+ }
+
+	/* the fixed-size item structure(s) must precede the name */
+ if (read_start - item_start < size) {
+ ret = false;
+ goto out;
+ }
+
+out:
+ if (!ret)
+		btrfs_crit(fs_info, "invalid name len: %u",
+ (unsigned int)name_len);
+ return ret;
+}
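/*
 * Usage sketch for the new helper, with an inode ref as the assumed
 * caller: validate that the embedded name stays inside the item before
 * copying it out of the leaf.
 */
static int copy_ref_name_sketch(struct extent_buffer *leaf, int slot,
				struct btrfs_inode_ref *ref, char *buf)
{
	u16 name_len = btrfs_inode_ref_name_len(leaf, ref);

	if (!btrfs_is_name_len_valid(leaf, slot,
				     (unsigned long)(ref + 1), name_len))
		return -EIO;	/* name would run past the item boundary */
	read_extent_buffer(leaf, buf, (unsigned long)(ref + 1), name_len);
	return 0;
}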
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 061c1d1f774f..086dcbadce09 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -87,9 +87,8 @@ struct btrfs_end_io_wq {
bio_end_io_t *end_io;
void *private;
struct btrfs_fs_info *info;
- int error;
+ blk_status_t status;
enum btrfs_wq_endio_type metadata;
- struct list_head list;
struct btrfs_work work;
};
@@ -118,9 +117,9 @@ void btrfs_end_io_wq_exit(void)
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
- struct inode *inode;
+ void *private_data;
+ struct btrfs_fs_info *fs_info;
struct bio *bio;
- struct list_head list;
extent_submit_bio_hook_t *submit_bio_start;
extent_submit_bio_hook_t *submit_bio_done;
int mirror_num;
@@ -131,7 +130,7 @@ struct async_submit_bio {
*/
u64 bio_offset;
struct btrfs_work work;
- int error;
+ blk_status_t status;
};
/*
@@ -762,7 +761,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
err:
if (reads_done &&
test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(fs_info, eb, ret);
+ btree_readahead_hook(eb, ret);
if (ret) {
/*
@@ -787,7 +786,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
eb->read_mirror = failed_mirror;
atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb->fs_info, eb, -EIO);
+ btree_readahead_hook(eb, -EIO);
return -EIO; /* we fixed nothing */
}
@@ -799,7 +798,7 @@ static void end_workqueue_bio(struct bio *bio)
btrfs_work_func_t func;
fs_info = end_io_wq->info;
- end_io_wq->error = bio->bi_error;
+ end_io_wq->status = bio->bi_status;
if (bio_op(bio) == REQ_OP_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
@@ -836,19 +835,19 @@ static void end_workqueue_bio(struct bio *bio)
btrfs_queue_work(wq, &end_io_wq->work);
}
-int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata)
{
struct btrfs_end_io_wq *end_io_wq;
end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
if (!end_io_wq)
- return -ENOMEM;
+ return BLK_STS_RESOURCE;
end_io_wq->private = bio->bi_private;
end_io_wq->end_io = bio->bi_end_io;
end_io_wq->info = info;
- end_io_wq->error = 0;
+ end_io_wq->status = 0;
end_io_wq->bio = bio;
end_io_wq->metadata = metadata;
@@ -868,14 +867,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
- int ret;
+ blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
- ret = async->submit_bio_start(async->inode, async->bio,
+ ret = async->submit_bio_start(async->private_data, async->bio,
async->mirror_num, async->bio_flags,
async->bio_offset);
if (ret)
- async->error = ret;
+ async->status = ret;
}
static void run_one_async_done(struct btrfs_work *work)
@@ -885,7 +884,7 @@ static void run_one_async_done(struct btrfs_work *work)
int limit;
async = container_of(work, struct async_submit_bio, work);
- fs_info = BTRFS_I(async->inode)->root->fs_info;
+ fs_info = async->fs_info;
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
@@ -898,13 +897,13 @@ static void run_one_async_done(struct btrfs_work *work)
wake_up(&fs_info->async_submit_wait);
/* If an error occurred we just want to clean up the bio and move on */
- if (async->error) {
- async->bio->bi_error = async->error;
+ if (async->status) {
+ async->bio->bi_status = async->status;
bio_endio(async->bio);
return;
}
- async->submit_bio_done(async->inode, async->bio, async->mirror_num,
+ async->submit_bio_done(async->private_data, async->bio, async->mirror_num,
async->bio_flags, async->bio_offset);
}
@@ -916,20 +915,20 @@ static void run_one_async_free(struct btrfs_work *work)
kfree(async);
}
-int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
- struct bio *bio, int mirror_num,
- unsigned long bio_flags,
- u64 bio_offset,
- extent_submit_bio_hook_t *submit_bio_start,
- extent_submit_bio_hook_t *submit_bio_done)
+blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset, void *private_data,
+ extent_submit_bio_hook_t *submit_bio_start,
+ extent_submit_bio_hook_t *submit_bio_done)
{
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
if (!async)
- return -ENOMEM;
+ return BLK_STS_RESOURCE;
- async->inode = inode;
+ async->private_data = private_data;
+ async->fs_info = fs_info;
async->bio = bio;
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
@@ -941,7 +940,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->bio_flags = bio_flags;
async->bio_offset = bio_offset;
- async->error = 0;
+ async->status = 0;
atomic_inc(&fs_info->nr_async_submits);
@@ -959,7 +958,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
return 0;
}
-static int btree_csum_one_bio(struct bio *bio)
+static blk_status_t btree_csum_one_bio(struct bio *bio)
{
struct bio_vec *bvec;
struct btrfs_root *root;
@@ -972,12 +971,12 @@ static int btree_csum_one_bio(struct bio *bio)
break;
}
- return ret;
+ return errno_to_blk_status(ret);
}
-static int __btree_submit_bio_start(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
/*
* when we're called for a write, we're already in the async
@@ -986,11 +985,12 @@ static int __btree_submit_bio_start(struct inode *inode, struct bio *bio,
return btree_csum_one_bio(bio);
}
-static int __btree_submit_bio_done(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
- int ret;
+ struct inode *inode = private_data;
+ blk_status_t ret;
/*
* when we're called for a write, we're already in the async
@@ -998,7 +998,7 @@ static int __btree_submit_bio_done(struct inode *inode, struct bio *bio,
*/
ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1);
if (ret) {
- bio->bi_error = ret;
+ bio->bi_status = ret;
bio_endio(bio);
}
return ret;
@@ -1015,13 +1015,14 @@ static int check_async_write(unsigned long bio_flags)
return 1;
}
-static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
+ struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int async = check_async_write(bio_flags);
- int ret;
+ blk_status_t ret;
if (bio_op(bio) != REQ_OP_WRITE) {
/*
@@ -1043,8 +1044,8 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
- ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, 0,
- bio_offset,
+ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
+ bio_offset, private_data,
__btree_submit_bio_start,
__btree_submit_bio_done);
}
@@ -1054,7 +1055,7 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
return 0;
out_w_error:
- bio->bi_error = ret;
+ bio->bi_status = ret;
bio_endio(bio);
return ret;
}
@@ -1222,10 +1223,10 @@ int btrfs_write_tree_block(struct extent_buffer *buf)
buf->start + buf->len - 1);
}
-int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
- return filemap_fdatawait_range(buf->pages[0]->mapping,
- buf->start, buf->start + buf->len - 1);
+ filemap_fdatawait_range(buf->pages[0]->mapping,
+ buf->start, buf->start + buf->len - 1);
}
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
@@ -1255,9 +1256,9 @@ void clean_tree_block(struct btrfs_fs_info *fs_info,
btrfs_assert_tree_locked(buf);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
- __percpu_counter_add(&fs_info->dirty_metadata_bytes,
- -buf->len,
- fs_info->dirty_metadata_batch);
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ -buf->len,
+ fs_info->dirty_metadata_batch);
/* ugh, clear_extent_buffer_dirty needs to lock the page */
btrfs_set_lock_blocking(buf);
clear_extent_buffer_dirty(buf);
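percpu_counter_add_batch() is a pure rename of __percpu_counter_add(); the batch argument still bounds how far each CPU's local delta may drift before it is folded into the shared count. A small sketch of the semantics, assuming an already-initialized counter and a batch of 32:

	percpu_counter_add_batch(&counter, 8, 32);	/* stays in this CPU's local delta */
	percpu_counter_add_batch(&counter, 40, 32);	/* |delta| >= batch: folded into the shared count */
	approx = percpu_counter_read(&counter);		/* may lag by up to batch-1 per CPU */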
@@ -1340,15 +1341,14 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
- atomic_set(&root->refs, 1);
+ refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshoted, 0);
atomic64_set(&root->qgroup_meta_rsv, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
if (!dummy)
- extent_io_tree_init(&root->dirty_log_pages,
- fs_info->btree_inode->i_mapping);
+ extent_io_tree_init(&root->dirty_log_pages, NULL);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
@@ -1820,7 +1820,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
bio = end_io_wq->bio;
- bio->bi_error = end_io_wq->error;
+ bio->bi_status = end_io_wq->status;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
@@ -2309,7 +2309,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
inode->i_mapping->a_ops = &btree_aops;
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
- extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping);
+ extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode);
BTRFS_I(inode)->io_tree.track_uptodate = 0;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
@@ -2626,7 +2626,6 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
- spin_lock_init(&fs_info->free_chunk_lock);
spin_lock_init(&fs_info->tree_mod_seq_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->qgroup_op_lock);
@@ -2662,12 +2661,11 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
- fs_info->fs_frozen = 0;
fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
- fs_info->free_chunk_space = 0;
+ atomic64_set(&fs_info->free_chunk_space, 0);
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
@@ -2704,10 +2702,8 @@ int open_ctree(struct super_block *sb,
fs_info->block_group_cache_tree = RB_ROOT;
fs_info->first_logical_byte = (u64)-1;
- extent_io_tree_init(&fs_info->freed_extents[0],
- fs_info->btree_inode->i_mapping);
- extent_io_tree_init(&fs_info->freed_extents[1],
- fs_info->btree_inode->i_mapping);
+ extent_io_tree_init(&fs_info->freed_extents[0], NULL);
+ extent_io_tree_init(&fs_info->freed_extents[1], NULL);
fs_info->pinned_extents = &fs_info->freed_extents[0];
set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
@@ -3467,10 +3463,12 @@ static int write_dev_supers(struct btrfs_device *device,
* we fua the first super. The others we allow
* to go down lazy.
*/
- if (i == 0)
- ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh);
- else
+ if (i == 0) {
+ ret = btrfsic_submit_bh(REQ_OP_WRITE,
+ REQ_SYNC | REQ_FUA, bh);
+ } else {
ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
+ }
if (ret)
errors++;
}
@@ -3483,64 +3481,61 @@ static int write_dev_supers(struct btrfs_device *device,
*/
static void btrfs_end_empty_barrier(struct bio *bio)
{
- if (bio->bi_private)
- complete(bio->bi_private);
- bio_put(bio);
+ complete(bio->bi_private);
}
/*
- * trigger flushes for one the devices. If you pass wait == 0, the flushes are
- * sent down. With wait == 1, it waits for the previous flush.
- *
- * any device where the flush fails with eopnotsupp are flagged as not-barrier
- * capable
+ * Submit a flush request to the device if it supports it. Error handling is
+ * done in the waiting counterpart.
*/
-static int write_dev_flush(struct btrfs_device *device, int wait)
+static void write_dev_flush(struct btrfs_device *device)
{
- struct bio *bio;
- int ret = 0;
+ struct request_queue *q = bdev_get_queue(device->bdev);
+ struct bio *bio = device->flush_bio;
- if (device->nobarriers)
- return 0;
+ if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+ return;
- if (wait) {
- bio = device->flush_bio;
- if (!bio)
- return 0;
+ bio_reset(bio);
+ bio->bi_end_io = btrfs_end_empty_barrier;
+ bio->bi_bdev = device->bdev;
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
+ init_completion(&device->flush_wait);
+ bio->bi_private = &device->flush_wait;
- wait_for_completion(&device->flush_wait);
+ submit_bio(bio);
+ device->flush_bio_sent = 1;
+}
- if (bio->bi_error) {
- ret = bio->bi_error;
- btrfs_dev_stat_inc_and_print(device,
- BTRFS_DEV_STAT_FLUSH_ERRS);
- }
+/*
+ * If the flush bio has been submitted by write_dev_flush, wait for it.
+ */
+static blk_status_t wait_dev_flush(struct btrfs_device *device)
+{
+ struct bio *bio = device->flush_bio;
- /* drop the reference from the wait == 0 run */
- bio_put(bio);
- device->flush_bio = NULL;
+ if (!device->flush_bio_sent)
+ return 0;
- return ret;
- }
+ device->flush_bio_sent = 0;
+ wait_for_completion_io(&device->flush_wait);
- /*
- * one reference for us, and we leave it for the
- * caller
- */
- device->flush_bio = NULL;
- bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
- if (!bio)
- return -ENOMEM;
+ return bio->bi_status;
+}
- bio->bi_end_io = btrfs_end_empty_barrier;
- bio->bi_bdev = device->bdev;
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
- init_completion(&device->flush_wait);
- bio->bi_private = &device->flush_wait;
- device->flush_bio = bio;
+static int check_barrier_error(struct btrfs_fs_devices *fsdevs)
+{
+ int dev_flush_error = 0;
+ struct btrfs_device *dev;
+
+ list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) {
+ if (!dev->bdev || dev->last_flush_error)
+ dev_flush_error++;
+ }
- bio_get(bio);
- btrfsic_submit_bio(bio);
+ if (dev_flush_error >
+ fsdevs->fs_info->num_tolerated_disk_barrier_failures)
+ return -EIO;
return 0;
}
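The flush path now splits submission from waiting around the preallocated device->flush_bio: bio_reset() recycles the embedded bio, QUEUE_FLAG_WC replaces the old nobarriers flag (a queue without a volatile write cache has nothing to flush), and errors are read from bi_status by the waiter rather than the submitter. A condensed sketch of the pattern, with flush_done standing in as a hypothetical end_io that calls complete() on bi_private:

	struct request_queue *q = bdev_get_queue(bdev);
	DECLARE_COMPLETION_ONSTACK(done);

	if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
		bio_reset(bio);				/* recycle the preallocated bio */
		bio->bi_bdev = bdev;
		bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
		bio->bi_private = &done;
		bio->bi_end_io = flush_done;
		submit_bio(bio);
		/* ... later, on the waiting side ... */
		wait_for_completion_io(&done);
		err = bio->bi_status;			/* error handling belongs to the waiter */
	}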
@@ -3553,25 +3548,21 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
{
struct list_head *head;
struct btrfs_device *dev;
- int errors_send = 0;
int errors_wait = 0;
- int ret;
+ blk_status_t ret;
/* send down all the barriers */
head = &info->fs_devices->devices;
list_for_each_entry_rcu(dev, head, dev_list) {
if (dev->missing)
continue;
- if (!dev->bdev) {
- errors_send++;
+ if (!dev->bdev)
continue;
- }
if (!dev->in_fs_metadata || !dev->writeable)
continue;
- ret = write_dev_flush(dev, 0);
- if (ret)
- errors_send++;
+ write_dev_flush(dev);
+ dev->last_flush_error = 0;
}
/* wait for all the barriers */
@@ -3585,13 +3576,23 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
if (!dev->in_fs_metadata || !dev->writeable)
continue;
- ret = write_dev_flush(dev, 1);
- if (ret)
+ ret = wait_dev_flush(dev);
+ if (ret) {
+ dev->last_flush_error = ret;
+ btrfs_dev_stat_inc_and_print(dev,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
errors_wait++;
+ }
+ }
+
+ if (errors_wait) {
+ /*
+ * We need the status of all disks to determine the
+ * overall volume status, so the error checking is
+ * pushed out to a separate loop in check_barrier_error().
+ */
+ return check_barrier_error(info->fs_devices);
}
- if (errors_send > info->num_tolerated_disk_barrier_failures ||
- errors_wait > info->num_tolerated_disk_barrier_failures)
- return -EIO;
return 0;
}
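check_barrier_error() folds missing devices and failed flushes into one count and compares it against the filesystem's barrier-failure tolerance. Worked through with assumed RAID1 numbers:

	/*
	 * RAID1 metadata: num_tolerated_disk_barrier_failures == 1.
	 *   dev_flush_error == 1  ->  1 > 1 is false  ->  return 0 (commit proceeds)
	 *   dev_flush_error == 2  ->  2 > 1 is true   ->  return -EIO
	 */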
@@ -4046,9 +4047,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
buf->start, transid, fs_info->generation);
was_dirty = set_extent_buffer_dirty(buf);
if (!was_dirty)
- __percpu_counter_add(&fs_info->dirty_metadata_bytes,
- buf->len,
- fs_info->dirty_metadata_batch);
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ buf->len,
+ fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
btrfs_print_leaf(fs_info, buf);
@@ -4321,7 +4322,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
@@ -4575,11 +4576,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
-
- /*
- memset(cur_trans, 0, sizeof(*cur_trans));
- kmem_cache_free(btrfs_transaction_cachep, cur_trans);
- */
}
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
@@ -4593,7 +4589,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
t = list_first_entry(&fs_info->trans_list,
struct btrfs_transaction, list);
if (t->state >= TRANS_STATE_COMMIT_START) {
- atomic_inc(&t->use_count);
+ refcount_inc(&t->use_count);
spin_unlock(&fs_info->trans_lock);
btrfs_wait_for_commit(fs_info, t->transid);
btrfs_put_transaction(t);
@@ -4635,6 +4631,12 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
return 0;
}
+static struct btrfs_fs_info *btree_fs_info(void *private_data)
+{
+ struct inode *inode = private_data;
+ return btrfs_sb(inode->i_sb);
+}
+
static const struct extent_io_ops btree_extent_io_ops = {
/* mandatory callbacks */
.submit_bio_hook = btree_submit_bio_hook,
@@ -4642,6 +4644,8 @@ static const struct extent_io_ops btree_extent_io_ops = {
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,
.readpage_io_failed_hook = btree_io_failed_hook,
+ .set_range_writeback = btrfs_set_range_writeback,
+ .tree_fs_info = btree_fs_info,
/* optional callbacks */
};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2e0ec29bfd69..0a634d3ffc16 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -101,14 +101,14 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
*/
static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
{
- if (atomic_inc_not_zero(&root->refs))
+ if (refcount_inc_not_zero(&root->refs))
return root;
return NULL;
}
static inline void btrfs_put_fs_root(struct btrfs_root *root)
{
- if (atomic_dec_and_test(&root->refs))
+ if (refcount_dec_and_test(&root->refs))
kfree(root);
}
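The refcount_t conversions keep the call sites nearly identical but add saturation and zero-detection: unlike atomic_t, incrementing from zero or overflowing WARNs instead of silently wrapping into a use-after-free. A sketch of the semantics the root helpers above depend on (obj is hypothetical):

	refcount_t r = REFCOUNT_INIT(1);

	if (refcount_inc_not_zero(&r))		/* returns false once r has hit 0 */
		refcount_dec(&r);
	if (refcount_dec_and_test(&r))		/* true exactly when r drops to 0 */
		kfree(obj);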
@@ -118,16 +118,16 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(const char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, u8 *result);
-int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata);
-int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
- struct bio *bio, int mirror_num,
- unsigned long bio_flags, u64 bio_offset,
+blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset, void *private_data,
extent_submit_bio_hook_t *submit_bio_start,
extent_submit_bio_hook_t *submit_bio_done);
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
int btrfs_write_tree_block(struct extent_buffer *buf);
-int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
+void btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 87144c9f9593..fa66980726c9 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -282,6 +282,11 @@ static int btrfs_get_name(struct dentry *parent, char *name,
name_len = btrfs_inode_ref_name_len(leaf, iref);
}
+ ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len);
+ if (!ret) {
+ btrfs_free_path(path);
+ return -EIO;
+ }
read_extent_buffer(leaf, name, name_ptr, name_len);
btrfs_free_path(path);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index be5477676cc8..375f8c728d91 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -97,10 +97,11 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-static int __reserve_metadata_bytes(struct btrfs_root *root,
+static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush);
+ enum btrfs_reserve_flush_enum flush,
+ bool system_chunk);
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 num_bytes);
@@ -131,6 +132,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
if (atomic_dec_and_test(&cache->count)) {
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
+
+ /*
+ * If not empty, someone is still holding the mutex of
+ * full_stripe_lock, which can only be released by the
+ * caller, and releasing the full stripe lock later will
+ * then be a use-after-free.
+ *
+ * There is no better way to resolve this, so only warn.
+ */
+ WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
kfree(cache);
}
@@ -316,14 +327,14 @@ get_caching_control(struct btrfs_block_group_cache *cache)
}
ctl = cache->caching_ctl;
- atomic_inc(&ctl->count);
+ refcount_inc(&ctl->count);
spin_unlock(&cache->lock);
return ctl;
}
static void put_caching_control(struct btrfs_caching_control *ctl)
{
- if (atomic_dec_and_test(&ctl->count))
+ if (refcount_dec_and_test(&ctl->count))
kfree(ctl);
}
@@ -599,7 +610,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
caching_ctl->progress = cache->key.objectid;
- atomic_set(&caching_ctl->count, 1);
+ refcount_set(&caching_ctl->count, 1);
btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
caching_thread, NULL, NULL);
@@ -620,7 +631,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
struct btrfs_caching_control *ctl;
ctl = cache->caching_ctl;
- atomic_inc(&ctl->count);
+ refcount_inc(&ctl->count);
prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock(&cache->lock);
@@ -707,7 +718,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
down_write(&fs_info->commit_root_sem);
- atomic_inc(&caching_ctl->count);
+ refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
up_write(&fs_info->commit_root_sem);
@@ -756,6 +767,26 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
return NULL;
}
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
+ u64 owner, u64 root_objectid)
+{
+ struct btrfs_space_info *space_info;
+ u64 flags;
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ } else {
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ }
+
+ space_info = __find_space_info(fs_info, flags);
+ ASSERT(space_info);
+ percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
+}
+
/*
* after adding space to the filesystem, we need to clear the full flags
* on all the space infos.
@@ -892,7 +923,7 @@ search_again:
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -2082,6 +2113,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset)
{
+ int old_ref_mod, new_ref_mod;
int ret;
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
@@ -2089,15 +2121,21 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
- num_bytes,
- parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL);
+ num_bytes, parent,
+ root_objectid, (int)owner,
+ BTRFS_ADD_DELAYED_REF, NULL,
+ &old_ref_mod, &new_ref_mod);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
- num_bytes, parent, root_objectid,
- owner, offset, 0,
- BTRFS_ADD_DELAYED_REF);
+ num_bytes, parent,
+ root_objectid, owner, offset,
+ 0, BTRFS_ADD_DELAYED_REF,
+ &old_ref_mod, &new_ref_mod);
}
+
+ if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
+ add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
+
return ret;
}
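The new old_ref_mod/new_ref_mod out-parameters let callers detect when a delayed ref head's net ref count crosses zero, which is the only point at which total_bytes_pinned should move. As a worked example of the rule used here and in btrfs_free_extent() below:

	/*
	 * ref_mod == -1: the extent is expected to be freed, so its bytes are
	 * counted as pinned.  Adding a ref that moves -1 -> 0 (old < 0,
	 * new >= 0) means it will not be freed after all, so subtract the
	 * bytes; dropping a ref that moves 0 -> -1 does the opposite.
	 */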
@@ -2401,6 +2439,16 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
head = btrfs_delayed_node_to_head(node);
trace_run_delayed_ref_head(fs_info, node, head, node->action);
+ if (head->total_ref_mod < 0) {
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, node->bytenr);
+ ASSERT(cache);
+ percpu_counter_add(&cache->space_info->total_bytes_pinned,
+ -node->num_bytes);
+ btrfs_put_block_group(cache);
+ }
+
if (insert_reserved) {
btrfs_pin_extent(fs_info, node->bytenr,
node->num_bytes, 1);
@@ -2980,7 +3028,7 @@ again:
struct btrfs_delayed_ref_node *ref;
ref = &head->node;
- atomic_inc(&ref->refs);
+ refcount_inc(&ref->refs);
spin_unlock(&delayed_refs->lock);
/*
@@ -3003,7 +3051,6 @@ again:
goto again;
}
out:
- assert_qgroups_uptodate(trans);
trans->can_flush_pending_bgs = can_flush_pending_bgs;
return 0;
}
@@ -3057,7 +3104,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -3355,6 +3402,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root = fs_info->tree_root;
struct inode *inode = NULL;
+ struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
int dcs = BTRFS_DC_ERROR;
u64 num_pages = 0;
@@ -3443,7 +3491,8 @@ again:
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
- * b) we're with nospace_cache mount option.
+ * b) we're with nospace_cache mount option,
+ * c) we're with v2 space_cache (FREE_SPACE_TREE).
*/
dcs = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
@@ -3473,7 +3522,7 @@ again:
num_pages *= 16;
num_pages *= PAGE_SIZE;
- ret = btrfs_check_data_free_space(inode, 0, num_pages);
+ ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
if (ret)
goto out_put;
@@ -3504,6 +3553,7 @@ out:
block_group->disk_cache_state = dcs;
spin_unlock(&block_group->lock);
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -3914,87 +3964,83 @@ static const char *alloc_name(u64 flags)
};
}
-static int update_space_info(struct btrfs_fs_info *info, u64 flags,
- u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly,
- struct btrfs_space_info **space_info)
+static int create_space_info(struct btrfs_fs_info *info, u64 flags,
+ struct btrfs_space_info **new)
{
- struct btrfs_space_info *found;
+
+ struct btrfs_space_info *space_info;
int i;
- int factor;
int ret;
- if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10))
- factor = 2;
- else
- factor = 1;
-
- found = __find_space_info(info, flags);
- if (found) {
- spin_lock(&found->lock);
- found->total_bytes += total_bytes;
- found->disk_total += total_bytes * factor;
- found->bytes_used += bytes_used;
- found->disk_used += bytes_used * factor;
- found->bytes_readonly += bytes_readonly;
- if (total_bytes > 0)
- found->full = 0;
- space_info_add_new_bytes(info, found, total_bytes -
- bytes_used - bytes_readonly);
- spin_unlock(&found->lock);
- *space_info = found;
- return 0;
- }
- found = kzalloc(sizeof(*found), GFP_NOFS);
- if (!found)
+ space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
+ if (!space_info)
return -ENOMEM;
- ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
+ GFP_KERNEL);
if (ret) {
- kfree(found);
+ kfree(space_info);
return ret;
}
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
- INIT_LIST_HEAD(&found->block_groups[i]);
- init_rwsem(&found->groups_sem);
- spin_lock_init(&found->lock);
- found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
- found->total_bytes = total_bytes;
- found->disk_total = total_bytes * factor;
- found->bytes_used = bytes_used;
- found->disk_used = bytes_used * factor;
- found->bytes_pinned = 0;
- found->bytes_reserved = 0;
- found->bytes_readonly = bytes_readonly;
- found->bytes_may_use = 0;
- found->full = 0;
- found->max_extent_size = 0;
- found->force_alloc = CHUNK_ALLOC_NO_FORCE;
- found->chunk_alloc = 0;
- found->flush = 0;
- init_waitqueue_head(&found->wait);
- INIT_LIST_HEAD(&found->ro_bgs);
- INIT_LIST_HEAD(&found->tickets);
- INIT_LIST_HEAD(&found->priority_tickets);
-
- ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
+ INIT_LIST_HEAD(&space_info->block_groups[i]);
+ init_rwsem(&space_info->groups_sem);
+ spin_lock_init(&space_info->lock);
+ space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+ init_waitqueue_head(&space_info->wait);
+ INIT_LIST_HEAD(&space_info->ro_bgs);
+ INIT_LIST_HEAD(&space_info->tickets);
+ INIT_LIST_HEAD(&space_info->priority_tickets);
+
+ ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
info->space_info_kobj, "%s",
- alloc_name(found->flags));
+ alloc_name(space_info->flags));
if (ret) {
- kfree(found);
+ percpu_counter_destroy(&space_info->total_bytes_pinned);
+ kfree(space_info);
return ret;
}
- *space_info = found;
- list_add_rcu(&found->list, &info->space_info);
+ *new = space_info;
+ list_add_rcu(&space_info->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
- info->data_sinfo = found;
+ info->data_sinfo = space_info;
return ret;
}
+static void update_space_info(struct btrfs_fs_info *info, u64 flags,
+ u64 total_bytes, u64 bytes_used,
+ u64 bytes_readonly,
+ struct btrfs_space_info **space_info)
+{
+ struct btrfs_space_info *found;
+ int factor;
+
+ if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
+
+ found = __find_space_info(info, flags);
+ ASSERT(found);
+ spin_lock(&found->lock);
+ found->total_bytes += total_bytes;
+ found->disk_total += total_bytes * factor;
+ found->bytes_used += bytes_used;
+ found->disk_used += bytes_used * factor;
+ found->bytes_readonly += bytes_readonly;
+ if (total_bytes > 0)
+ found->full = 0;
+ space_info_add_new_bytes(info, found, total_bytes -
+ bytes_used - bytes_readonly);
+ spin_unlock(&found->lock);
+ *space_info = found;
+}
+
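After the split, creation can fail but accounting cannot, so update_space_info() returns void and callers that may encounter a new flags combination follow a find-or-create-then-update sequence. A sketch of the pattern the later hunks adopt:

	space_info = __find_space_info(fs_info, flags);
	if (!space_info) {
		ret = create_space_info(fs_info, flags, &space_info);
		if (ret)			/* only allocation-style failures */
			return ret;
	}
	update_space_info(fs_info, flags, total_bytes, bytes_used,
			  bytes_readonly, &space_info);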
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
u64 extra_flags = chunk_to_extended(flags) &
@@ -4110,7 +4156,7 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
return btrfs_reduce_alloc_profile(fs_info, flags);
}
-u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 flags;
@@ -4127,6 +4173,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
return ret;
}
+u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+ return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
+}
+
+u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+ return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+}
+
+u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+ return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+}
+
static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
bool may_use_included)
{
@@ -4176,7 +4237,7 @@ again:
data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
spin_unlock(&data_sinfo->lock);
alloc:
- alloc_target = btrfs_get_alloc_profile(root, 1);
+ alloc_target = btrfs_data_alloc_profile(fs_info);
/*
* It is ugly that we don't call nolock join
* transaction for the free space inode case here.
@@ -4227,7 +4288,7 @@ commit_trans:
if (need_commit > 0) {
btrfs_start_delalloc_roots(fs_info, 0, -1);
- btrfs_wait_ordered_roots(fs_info, -1, 0,
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
(u64)-1);
}
@@ -4267,12 +4328,8 @@ commit_trans:
return ret;
}
-/*
- * New check_data_free_space() with ability for precious data reservation
- * Will replace old btrfs_check_data_free_space(), but for patch split,
- * add a new function first and then replace it.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+int btrfs_check_data_free_space(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
@@ -4287,9 +4344,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
- ret = btrfs_qgroup_reserve_data(inode, start, len);
- if (ret)
+ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
+ if (ret < 0)
btrfs_free_reserved_data_space_noquota(inode, start, len);
+ else
+ ret = 0;
return ret;
}
@@ -4330,7 +4389,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
* This one will handle the per-inode data rsv map for accurate reserved
* space framework.
*/
-void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+void btrfs_free_reserved_data_space(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4340,7 +4400,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
start = round_down(start, root->fs_info->sectorsize);
btrfs_free_reserved_data_space_noquota(inode, start, len);
- btrfs_qgroup_free_data(inode, start, len);
+ btrfs_qgroup_free_data(inode, reserved, start, len);
}
static void force_metadata_allocation(struct btrfs_fs_info *info)
@@ -4452,9 +4512,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
}
if (left < thresh) {
- u64 flags;
+ u64 flags = btrfs_system_alloc_profile(fs_info);
- flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
/*
* Ignore failure to create system chunk. We might end up not
* needing it, as we might not need to COW all nodes/leafs from
@@ -4495,10 +4554,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
space_info = __find_space_info(fs_info, flags);
if (!space_info) {
- ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
- BUG_ON(ret); /* -ENOMEM */
+ ret = create_space_info(fs_info, flags, &space_info);
+ if (ret)
+ return ret;
}
- BUG_ON(!space_info); /* Logic error */
again:
spin_lock(&space_info->lock);
@@ -4603,11 +4662,11 @@ out:
return ret;
}
-static int can_overcommit(struct btrfs_root *root,
+static int can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
- enum btrfs_reserve_flush_enum flush)
+ enum btrfs_reserve_flush_enum flush,
+ bool system_chunk)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 profile;
u64 space_size;
@@ -4618,7 +4677,11 @@ static int can_overcommit(struct btrfs_root *root,
if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
return 0;
- profile = btrfs_get_alloc_profile(root, 0);
+ if (system_chunk)
+ profile = btrfs_system_alloc_profile(fs_info);
+ else
+ profile = btrfs_metadata_alloc_profile(fs_info);
+
used = btrfs_space_info_used(space_info, false);
/*
@@ -4635,9 +4698,7 @@ static int can_overcommit(struct btrfs_root *root,
used += space_info->bytes_may_use;
- spin_lock(&fs_info->free_chunk_lock);
- avail = fs_info->free_chunk_space;
- spin_unlock(&fs_info->free_chunk_lock);
+ avail = atomic64_read(&fs_info->free_chunk_space);
/*
* If we have dup, raid1 or raid10 then only half of the free
@@ -4687,14 +4748,14 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
}
}
-static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
+static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
u64 bytes;
- int nr;
+ u64 nr;
bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
- nr = (int)div64_u64(to_reclaim, bytes);
+ nr = div64_u64(to_reclaim, bytes);
if (!nr)
nr = 1;
return nr;
@@ -4705,24 +4766,23 @@ static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
/*
* shrink metadata reservation for delalloc
*/
-static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
- bool wait_ordered)
+static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
+ u64 orig, bool wait_ordered)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 max_reclaim;
+ u64 items;
long time_left;
unsigned long nr_pages;
int loops;
- int items;
enum btrfs_reserve_flush_enum flush;
/* Calc the number of the pages we need flush for space reservation */
items = calc_reclaim_items_nr(fs_info, to_reclaim);
- to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
+ to_reclaim = items * EXTENT_SIZE_PER_ITEM;
trans = (struct btrfs_trans_handle *)current->journal_info;
block_rsv = &fs_info->delalloc_block_rsv;
@@ -4765,7 +4825,7 @@ skip_async:
else
flush = BTRFS_RESERVE_NO_FLUSH;
spin_lock(&space_info->lock);
- if (can_overcommit(root, space_info, orig, flush)) {
+ if (can_overcommit(fs_info, space_info, orig, flush, false)) {
spin_unlock(&space_info->lock);
break;
}
@@ -4827,14 +4887,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
spin_lock(&delayed_rsv->lock);
if (percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes - delayed_rsv->size) >= 0) {
+ bytes - delayed_rsv->size) < 0) {
spin_unlock(&delayed_rsv->lock);
return -ENOSPC;
}
spin_unlock(&delayed_rsv->lock);
commit:
- trans = btrfs_join_transaction(fs_info->fs_root);
+ trans = btrfs_join_transaction(fs_info->extent_root);
if (IS_ERR(trans))
return -ENOSPC;
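The flipped comparison corrects inverted logic: a commit is only worthwhile when the pinned bytes it would release can cover the part of the reservation that the delayed rsv cannot. With assumed numbers:

	/*
	 * bytes = 8M, delayed_rsv->size = 2M  =>  need total_bytes_pinned >= 6M.
	 *   pinned = 4M: compare(4M, 6M) < 0   ->  -ENOSPC, skip the commit
	 *   pinned = 7M: compare(7M, 6M) >= 0  ->  fall through and commit
	 */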
@@ -4852,7 +4912,7 @@ static int flush_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 num_bytes,
u64 orig_bytes, int state)
{
- struct btrfs_root *root = fs_info->fs_root;
+ struct btrfs_root *root = fs_info->extent_root;
struct btrfs_trans_handle *trans;
int nr;
int ret = 0;
@@ -4875,7 +4935,7 @@ static int flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(root, num_bytes * 2, orig_bytes,
+ shrink_delalloc(fs_info, num_bytes * 2, orig_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
case ALLOC_CHUNK:
@@ -4885,7 +4945,7 @@ static int flush_space(struct btrfs_fs_info *fs_info,
break;
}
ret = do_chunk_alloc(trans, fs_info,
- btrfs_get_alloc_profile(root, 0),
+ btrfs_metadata_alloc_profile(fs_info),
CHUNK_ALLOC_NO_FORCE);
btrfs_end_transaction(trans);
if (ret > 0 || ret == -ENOSPC)
@@ -4906,8 +4966,9 @@ static int flush_space(struct btrfs_fs_info *fs_info,
}
static inline u64
-btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
- struct btrfs_space_info *space_info)
+btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ bool system_chunk)
{
struct reserve_ticket *ticket;
u64 used;
@@ -4922,14 +4983,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
return to_reclaim;
to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
- if (can_overcommit(root, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL))
+ if (can_overcommit(fs_info, space_info, to_reclaim,
+ BTRFS_RESERVE_FLUSH_ALL, system_chunk))
return 0;
- used = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
- if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
+ used = btrfs_space_info_used(space_info, true);
+
+ if (can_overcommit(fs_info, space_info, SZ_1M,
+ BTRFS_RESERVE_FLUSH_ALL, system_chunk))
expected = div_factor_fine(space_info->total_bytes, 95);
else
expected = div_factor_fine(space_info->total_bytes, 90);
@@ -4943,17 +5004,18 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
return to_reclaim;
}
-static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_root *root, u64 used)
+static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 used, bool system_chunk)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
/* If we're just plain full then async reclaim just slows us down. */
if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
return 0;
- if (!btrfs_calc_reclaim_metadata_size(root, space_info))
+ if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+ system_chunk))
return 0;
return (used >= thresh && !btrfs_fs_closing(fs_info) &&
@@ -4990,8 +5052,8 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
- space_info);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+ false);
if (!to_reclaim) {
space_info->flush = 0;
spin_unlock(&space_info->lock);
@@ -5013,8 +5075,9 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
spin_unlock(&space_info->lock);
return;
}
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
- space_info);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
+ space_info,
+ false);
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
if (last_tickets_id == space_info->tickets_id) {
@@ -5052,8 +5115,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
int flush_state = FLUSH_DELAYED_ITEMS_NR;
spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
- space_info);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+ false);
if (!to_reclaim) {
spin_unlock(&space_info->lock);
return;
@@ -5132,12 +5195,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int __reserve_metadata_bytes(struct btrfs_root *root,
+static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+ enum btrfs_reserve_flush_enum flush,
+ bool system_chunk)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct reserve_ticket ticket;
u64 used;
int ret = 0;
@@ -5159,7 +5222,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags, orig_bytes, 1);
ret = 0;
- } else if (can_overcommit(root, space_info, orig_bytes, flush)) {
+ } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
+ system_chunk)) {
space_info->bytes_may_use += orig_bytes;
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags, orig_bytes, 1);
@@ -5186,7 +5250,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
orig_bytes, flush,
"enospc");
queue_work(system_unbound_wq,
- &root->fs_info->async_reclaim_work);
+ &fs_info->async_reclaim_work);
}
} else {
list_add_tail(&ticket.list,
@@ -5200,7 +5264,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
* the async reclaim as we will panic.
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
- need_do_async_reclaim(space_info, root, used) &&
+ need_do_async_reclaim(fs_info, space_info,
+ used, system_chunk) &&
!work_busy(&fs_info->async_reclaim_work)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
@@ -5258,9 +5323,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
+ bool system_chunk = (root == fs_info->chunk_root);
- ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
- flush);
+ ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ orig_bytes, flush, system_chunk);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
if (block_rsv != global_rsv &&
@@ -5369,9 +5435,7 @@ static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
* overcommit, and if we can't then we just need to free up our space
* and not satisfy any requests.
*/
- used = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
+ used = btrfs_space_info_used(space_info, true);
if (used - num_bytes >= space_info->total_bytes)
check_overcommit = true;
again:
@@ -5383,8 +5447,7 @@ again:
* adding the ticket space would be a double count.
*/
if (check_overcommit &&
- !can_overcommit(fs_info->extent_root, space_info, 0,
- flush))
+ !can_overcommit(fs_info, space_info, 0, flush, false))
break;
if (num_bytes >= ticket->bytes) {
list_del_init(&ticket->list);
@@ -6113,6 +6176,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
* @inode: inode we're writing to
* @start: start range we are writing to
* @len: how long the range we are writing to
+ * @reserved: mandatory parameter, records the qgroup ranges actually
+ * reserved by the current reservation.
*
* This will do the following things
*
@@ -6130,16 +6195,17 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
* Return 0 for success
* Return <0 for error (-ENOSPC or -EDQUOT)
*/
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
+int btrfs_delalloc_reserve_space(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len)
{
int ret;
- ret = btrfs_check_data_free_space(inode, start, len);
+ ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
if (ret < 0)
- btrfs_free_reserved_data_space(inode, start, len);
+ btrfs_free_reserved_data_space(inode, *reserved, start, len);
return ret;
}
@@ -6158,10 +6224,11 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
* list if there are no delalloc bytes left.
* Also it will handle the qgroup reserved space.
*/
-void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
+void btrfs_delalloc_release_space(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len)
{
btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
- btrfs_free_reserved_data_space(inode, start, len);
+ btrfs_free_reserved_data_space(inode, reserved, start, len);
}
static int update_block_group(struct btrfs_trans_handle *trans,
@@ -6237,6 +6304,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
trace_btrfs_space_reservation(info, "pinned",
cache->space_info->flags,
num_bytes, 1);
+ percpu_counter_add(&cache->space_info->total_bytes_pinned,
+ num_bytes);
set_extent_dirty(info->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
@@ -6313,6 +6382,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info,
trace_btrfs_space_reservation(fs_info, "pinned",
cache->space_info->flags, num_bytes, 1);
+ percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes);
set_extent_dirty(fs_info->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
@@ -6783,27 +6853,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
return 0;
}
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
- u64 owner, u64 root_objectid)
-{
- struct btrfs_space_info *space_info;
- u64 flags;
-
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
- else
- flags = BTRFS_BLOCK_GROUP_METADATA;
- } else {
- flags = BTRFS_BLOCK_GROUP_DATA;
- }
-
- space_info = __find_space_info(fs_info, flags);
- BUG_ON(!space_info); /* Logic bug */
- percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
-}
-
-
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *info,
struct btrfs_delayed_ref_node *node, u64 parent,
@@ -7026,8 +7075,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
}
- add_pinned_bytes(info, -num_bytes, owner_objectid,
- root_objectid);
} else {
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
@@ -7159,19 +7206,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
int ret;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(fs_info, trans,
- buf->start, buf->len,
- parent,
+ int old_ref_mod, new_ref_mod;
+
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
+ buf->len, parent,
root->root_key.objectid,
btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL);
+ BTRFS_DROP_DELAYED_REF, NULL,
+ &old_ref_mod, &new_ref_mod);
BUG_ON(ret); /* -ENOMEM */
+ pin = old_ref_mod >= 0 && new_ref_mod < 0;
}
- if (!last_ref)
- return;
-
- if (btrfs_header_generation(buf) == trans->transid) {
+ if (last_ref && btrfs_header_generation(buf) == trans->transid) {
struct btrfs_block_group_cache *cache;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -7180,6 +7227,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
goto out;
}
+ pin = 0;
cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
@@ -7195,18 +7243,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_free_reserved_bytes(cache, buf->len, 0);
btrfs_put_block_group(cache);
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
- pin = 0;
}
out:
if (pin)
add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
root->root_key.objectid);
- /*
- * Deleting the buffer, clear the corrupt flag since it doesn't matter
- * anymore.
- */
- clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
+ if (last_ref) {
+ /*
+ * Deleting the buffer, clear the corrupt flag since it doesn't
+ * matter anymore.
+ */
+ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
+ }
}
/* Can return -ENOMEM */
@@ -7215,12 +7264,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
u64 owner, u64 offset)
{
+ int old_ref_mod, new_ref_mod;
int ret;
if (btrfs_is_testing(fs_info))
return 0;
- add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
/*
* tree log blocks never actually go into the extent allocation
@@ -7230,19 +7279,25 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
/* unlocks the pinned mutex */
btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
+ old_ref_mod = new_ref_mod = 0;
ret = 0;
} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
- num_bytes,
- parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL);
+ num_bytes, parent,
+ root_objectid, (int)owner,
+ BTRFS_DROP_DELAYED_REF, NULL,
+ &old_ref_mod, &new_ref_mod);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
- num_bytes,
- parent, root_objectid, owner,
- offset, 0,
- BTRFS_DROP_DELAYED_REF);
+ num_bytes, parent,
+ root_objectid, owner, offset,
+ 0, BTRFS_DROP_DELAYED_REF,
+ &old_ref_mod, &new_ref_mod);
}
+
+ if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
+ add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
+
return ret;
}
@@ -7945,7 +8000,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
u64 flags;
int ret;
- flags = btrfs_get_alloc_profile(root, is_data);
+ flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
@@ -8189,9 +8244,9 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
- ins->offset, 0,
- root_objectid, owner, offset,
- ram_bytes, BTRFS_ADD_DELAYED_EXTENT);
+ ins->offset, 0, root_objectid, owner,
+ offset, ram_bytes,
+ BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
return ret;
}
@@ -8411,11 +8466,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->is_data = false;
extent_op->level = level;
- ret = btrfs_add_delayed_tree_ref(fs_info, trans,
- ins.objectid, ins.offset,
- parent, root_objectid, level,
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
+ ins.offset, parent,
+ root_objectid, level,
BTRFS_ADD_DELAYED_EXTENT,
- extent_op);
+ extent_op, NULL, NULL);
if (ret)
goto out_free_delayed;
}
@@ -9917,6 +9972,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
mutex_init(&cache->free_space_lock);
+ btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
return cache;
}
@@ -10047,19 +10103,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
}
trace_btrfs_add_block_group(info, cache, 0);
- ret = update_space_info(info, cache->flags, found_key.offset,
- btrfs_block_group_used(&cache->item),
- cache->bytes_super, &space_info);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- spin_lock(&info->block_group_cache_lock);
- rb_erase(&cache->cache_node,
- &info->block_group_cache_tree);
- RB_CLEAR_NODE(&cache->cache_node);
- spin_unlock(&info->block_group_cache_lock);
- btrfs_put_block_group(cache);
- goto error;
- }
+ update_space_info(info, cache->flags, found_key.offset,
+ btrfs_block_group_used(&cache->item),
+ cache->bytes_super, &space_info);
cache->space_info = space_info;
@@ -10191,16 +10237,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
}
#endif
/*
- * Call to ensure the corresponding space_info object is created and
- * assigned to our block group, but don't update its counters just yet.
- * We want our bg to be added to the rbtree with its ->space_info set.
+ * Ensure the corresponding space_info object is created and
+ * assigned to our block group. We want our bg to be added to the rbtree
+ * with its ->space_info set.
*/
- ret = update_space_info(fs_info, cache->flags, 0, 0, 0,
- &cache->space_info);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- btrfs_put_block_group(cache);
- return ret;
+ cache->space_info = __find_space_info(fs_info, cache->flags);
+ if (!cache->space_info) {
+ ret = create_space_info(fs_info, cache->flags,
+ &cache->space_info);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
}
ret = btrfs_add_block_group_cache(fs_info, cache);
@@ -10215,18 +10264,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* the rbtree, update the space info's counters.
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
- ret = update_space_info(fs_info, cache->flags, size, bytes_used,
+ update_space_info(fs_info, cache->flags, size, bytes_used,
cache->bytes_super, &cache->space_info);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- spin_lock(&fs_info->block_group_cache_lock);
- rb_erase(&cache->cache_node,
- &fs_info->block_group_cache_tree);
- RB_CLEAR_NODE(&cache->cache_node);
- spin_unlock(&fs_info->block_group_cache_lock);
- btrfs_put_block_group(cache);
- return ret;
- }
update_global_block_rsv(fs_info);
__link_block_group(cache->space_info, cache);
@@ -10416,7 +10455,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
&fs_info->caching_block_groups, list)
if (ctl->block_group == block_group) {
caching_ctl = ctl;
- atomic_inc(&caching_ctl->count);
+ refcount_inc(&caching_ctl->count);
break;
}
}
@@ -10774,21 +10813,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
mixed = 1;
flags = BTRFS_BLOCK_GROUP_SYSTEM;
- ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
+ ret = create_space_info(fs_info, flags, &space_info);
if (ret)
goto out;
if (mixed) {
flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
- ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
+ ret = create_space_info(fs_info, flags, &space_info);
} else {
flags = BTRFS_BLOCK_GROUP_METADATA;
- ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
+ ret = create_space_info(fs_info, flags, &space_info);
if (ret)
goto out;
flags = BTRFS_BLOCK_GROUP_DATA;
- ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
+ ret = create_space_info(fs_info, flags, &space_info);
}
out:
return ret;
@@ -10850,7 +10889,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
spin_lock(&fs_info->trans_lock);
trans = fs_info->running_transaction;
if (trans)
- atomic_inc(&trans->use_count);
+ refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
ret = find_free_dev_extent_start(trans, device, minlen, start,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 27fdb250b446..556484cf5d93 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -68,7 +68,7 @@ void btrfs_leak_debug_check(void)
pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
state->start, state->end, state->state,
extent_state_in_tree(state),
- atomic_read(&state->refs));
+ refcount_read(&state->refs));
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
}
@@ -87,19 +87,9 @@ void btrfs_leak_debug_check(void)
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree, u64 start, u64 end)
{
- struct inode *inode;
- u64 isize;
-
- if (!tree->mapping)
- return;
-
- inode = tree->mapping->host;
- isize = i_size_read(inode);
- if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
- "%s: ino %llu isize %llu odd range [%llu,%llu]",
- caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
- }
+ if (tree->ops && tree->ops->check_extent_io_range)
+ tree->ops->check_extent_io_range(tree->private_data, caller,
+ start, end);
}
#else
#define btrfs_leak_debug_add(new, head) do {} while (0)
@@ -154,9 +144,9 @@ static noinline void flush_write_bio(void *data);
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
- if (!tree->mapping)
- return NULL;
- return btrfs_sb(tree->mapping->host->i_sb);
+ if (tree->ops)
+ return tree->ops->tree_fs_info(tree->private_data);
+ return NULL;
}
int __init extent_io_init(void)
@@ -174,7 +164,8 @@ int __init extent_io_init(void)
goto free_state_cache;
btrfs_bioset = bioset_create(BIO_POOL_SIZE,
- offsetof(struct btrfs_io_bio, bio));
+ offsetof(struct btrfs_io_bio, bio),
+ BIOSET_NEED_BVECS);
if (!btrfs_bioset)
goto free_buffer_cache;
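bioset_create() grew a flags argument in this release; BIOSET_NEED_BVECS requests the bvec pool that multi-page bios require, which btrfs needs here. A caller that only clones or builds single-segment bios could pass 0, roughly:

	struct bio_set *bs = bioset_create(64, 0, 0);	/* no front_pad, no bvec pool */

	if (!bs)
		return -ENOMEM;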
@@ -213,13 +204,13 @@ void extent_io_exit(void)
}
void extent_io_tree_init(struct extent_io_tree *tree,
- struct address_space *mapping)
+ void *private_data)
{
tree->state = RB_ROOT;
tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
- tree->mapping = mapping;
+ tree->private_data = private_data;
}
static struct extent_state *alloc_extent_state(gfp_t mask)
@@ -238,7 +229,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
state->failrec = NULL;
RB_CLEAR_NODE(&state->rb_node);
btrfs_leak_debug_add(&state->leak_list, &states);
- atomic_set(&state->refs, 1);
+ refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
trace_alloc_extent_state(state, mask, _RET_IP_);
return state;
@@ -248,7 +239,7 @@ void free_extent_state(struct extent_state *state)
{
if (!state)
return;
- if (atomic_dec_and_test(&state->refs)) {
+ if (refcount_dec_and_test(&state->refs)) {
WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del(&state->leak_list);
trace_free_extent_state(state, _RET_IP_);
@@ -369,8 +360,7 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
struct extent_state *other)
{
if (tree->ops && tree->ops->merge_extent_hook)
- tree->ops->merge_extent_hook(tree->mapping->host, new,
- other);
+ tree->ops->merge_extent_hook(tree->private_data, new, other);
}
/*
@@ -421,15 +411,14 @@ static void set_state_cb(struct extent_io_tree *tree,
struct extent_state *state, unsigned *bits)
{
if (tree->ops && tree->ops->set_bit_hook)
- tree->ops->set_bit_hook(tree->mapping->host, state, bits);
+ tree->ops->set_bit_hook(tree->private_data, state, bits);
}
static void clear_state_cb(struct extent_io_tree *tree,
struct extent_state *state, unsigned *bits)
{
if (tree->ops && tree->ops->clear_bit_hook)
- tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host),
- state, bits);
+ tree->ops->clear_bit_hook(tree->private_data, state, bits);
}
static void set_state_bits(struct extent_io_tree *tree,
@@ -478,7 +467,7 @@ static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
u64 split)
{
if (tree->ops && tree->ops->split_extent_hook)
- tree->ops->split_extent_hook(tree->mapping->host, orig, split);
+ tree->ops->split_extent_hook(tree->private_data, orig, split);
}
/*
@@ -641,7 +630,7 @@ again:
if (cached && extent_state_in_tree(cached) &&
cached->start <= start && cached->end > start) {
if (clear)
- atomic_dec(&cached->refs);
+ refcount_dec(&cached->refs);
state = cached;
goto hit_next;
}
@@ -793,7 +782,7 @@ process_node:
if (state->state & bits) {
start = state->start;
- atomic_inc(&state->refs);
+ refcount_inc(&state->refs);
wait_on_state(tree, state);
free_extent_state(state);
goto again;
@@ -834,7 +823,7 @@ static void cache_state_if_flags(struct extent_state *state,
if (cached_ptr && !(*cached_ptr)) {
if (!flags || (state->state & flags)) {
*cached_ptr = state;
- atomic_inc(&state->refs);
+ refcount_inc(&state->refs);
}
}
}
@@ -1402,17 +1391,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
*/
static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
-
- while (index <= end_index) {
- page = find_get_page(tree->mapping, index);
- BUG_ON(!page); /* Pages should be in the extent_io_tree */
- set_page_writeback(page);
- put_page(page);
- index++;
- }
+ tree->ops->set_range_writeback(tree->private_data, start, end);
}
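On the other side of this hook, the inode-backed implementation presumably keeps the removed loop, with tree->mapping replaced by the inode's mapping (a sketch; the function name is assumed):

static void btrfs_set_range_writeback(void *private_data, u64 start, u64 end)
{
	struct inode *inode = private_data;
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page);	/* pages should be in the extent_io_tree */
		set_page_writeback(page);
		put_page(page);
		index++;
	}
}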
/* find the first state struct with 'bits' set after 'start', and
@@ -1538,7 +1517,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
if (!found) {
*start = state->start;
*cached_state = state;
- atomic_inc(&state->refs);
+ refcount_inc(&state->refs);
}
found++;
*end = state->end;
@@ -1961,11 +1940,12 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
SetPageUptodate(page);
}
-int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
+int free_io_failure(struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree,
+ struct io_failure_record *rec)
{
int ret;
int err = 0;
- struct extent_io_tree *failure_tree = &inode->io_failure_tree;
set_state_failrec(failure_tree, rec->start, NULL);
ret = clear_extent_bits(failure_tree, rec->start,
@@ -1974,7 +1954,7 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
if (ret)
err = ret;
- ret = clear_extent_bits(&inode->io_tree, rec->start,
+ ret = clear_extent_bits(io_tree, rec->start,
rec->start + rec->len - 1,
EXTENT_DAMAGED);
if (ret && !err)
@@ -1994,29 +1974,21 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
- u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio;
struct btrfs_device *dev;
u64 map_length = 0;
u64 sector;
struct btrfs_bio *bbio = NULL;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
int ret;
ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
BUG_ON(!mirror_num);
- /* we can't repair anything in raid56 yet */
- if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
- return 0;
-
- bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio)
- return -EIO;
+ bio = btrfs_io_bio_alloc(1);
bio->bi_iter.bi_size = 0;
map_length = length;
@@ -2026,17 +1998,35 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
* read repair operation.
*/
btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bbio, mirror_num);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
+ if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) {
+ /*
+ * Note that we don't use BTRFS_MAP_WRITE because it's supposed
+ * to update all raid stripes, but here we just want to correct
+ * the bad stripe, so BTRFS_MAP_READ is abused to only get the
+ * bad stripe's dev and sector.
+ */
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ &map_length, &bbio, 0);
+ if (ret) {
+ btrfs_bio_counter_dec(fs_info);
+ bio_put(bio);
+ return -EIO;
+ }
+ ASSERT(bbio->mirror_num == 1);
+ } else {
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
+ &map_length, &bbio, mirror_num);
+ if (ret) {
+ btrfs_bio_counter_dec(fs_info);
+ bio_put(bio);
+ return -EIO;
+ }
+ BUG_ON(mirror_num != bbio->mirror_num);
}
- BUG_ON(mirror_num != bbio->mirror_num);
- sector = bbio->stripes[mirror_num-1].physical >> 9;
+
+ sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
bio->bi_iter.bi_sector = sector;
- dev = bbio->stripes[mirror_num-1].dev;
+ dev = bbio->stripes[bbio->mirror_num - 1].dev;
btrfs_put_bbio(bbio);
if (!dev || !dev->bdev || !dev->writeable) {
btrfs_bio_counter_dec(fs_info);
@@ -2057,7 +2047,7 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
btrfs_info_rl_in_rcu(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
- btrfs_ino(inode), start,
+ ino, start,
rcu_str_deref(dev->name), sector);
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
@@ -2077,8 +2067,7 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
- ret = repair_io_failure(BTRFS_I(fs_info->btree_inode), start,
- PAGE_SIZE, start, p,
+ ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
start - page_offset(p), mirror_num);
if (ret)
break;
@@ -2092,24 +2081,24 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
*/
-int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page,
- unsigned int pg_offset)
+int clean_io_failure(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree, u64 start,
+ struct page *page, u64 ino, unsigned int pg_offset)
{
u64 private;
struct io_failure_record *failrec;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_state *state;
int num_copies;
int ret;
private = 0;
- ret = count_range_bits(&inode->io_failure_tree, &private,
- (u64)-1, 1, EXTENT_DIRTY, 0);
+ ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
+ EXTENT_DIRTY, 0);
if (!ret)
return 0;
- ret = get_state_failrec(&inode->io_failure_tree, start,
- &failrec);
+ ret = get_state_failrec(failure_tree, start, &failrec);
if (ret)
return 0;
@@ -2125,25 +2114,25 @@ int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page,
if (fs_info->sb->s_flags & MS_RDONLY)
goto out;
- spin_lock(&inode->io_tree.lock);
- state = find_first_extent_bit_state(&inode->io_tree,
+ spin_lock(&io_tree->lock);
+ state = find_first_extent_bit_state(io_tree,
failrec->start,
EXTENT_LOCKED);
- spin_unlock(&inode->io_tree.lock);
+ spin_unlock(&io_tree->lock);
if (state && state->start <= failrec->start &&
state->end >= failrec->start + failrec->len - 1) {
num_copies = btrfs_num_copies(fs_info, failrec->logical,
failrec->len);
if (num_copies > 1) {
- repair_io_failure(inode, start, failrec->len,
- failrec->logical, page,
- pg_offset, failrec->failed_mirror);
+ repair_io_failure(fs_info, ino, start, failrec->len,
+ failrec->logical, page, pg_offset,
+ failrec->failed_mirror);
}
}
out:
- free_io_failure(inode, failrec);
+ free_io_failure(failure_tree, io_tree, failrec);
return 0;
}
@@ -2343,10 +2332,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
struct btrfs_io_bio *btrfs_failed_bio;
struct btrfs_io_bio *btrfs_bio;
- bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio)
- return NULL;
-
+ bio = btrfs_io_bio_alloc(1);
bio->bi_end_io = endio_func;
bio->bi_iter.bi_sector = failrec->logical >> 9;
bio->bi_bdev = fs_info->fs_devices->latest_bdev;
@@ -2384,8 +2370,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
struct io_failure_record *failrec;
struct inode *inode = page->mapping->host;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct bio *bio;
int read_mode = 0;
+ blk_status_t status;
int ret;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2396,7 +2384,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
if (!ret) {
- free_io_failure(BTRFS_I(inode), failrec);
+ free_io_failure(failure_tree, tree, failrec);
return -EIO;
}
@@ -2409,7 +2397,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
(int)phy_offset, failed_bio->bi_end_io,
NULL);
if (!bio) {
- free_io_failure(BTRFS_I(inode), failrec);
+ free_io_failure(failure_tree, tree, failrec);
return -EIO;
}
bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
@@ -2418,11 +2406,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
"Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
read_mode, failrec->this_mirror, failrec->in_validation);
- ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror,
+ status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
failrec->bio_flags, 0);
- if (ret) {
- free_io_failure(BTRFS_I(inode), failrec);
+ if (status) {
+ free_io_failure(failure_tree, tree, failrec);
bio_put(bio);
+ ret = blk_status_to_errno(status);
}
return ret;
@@ -2445,7 +2434,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
if (!uptodate) {
ClearPageUptodate(page);
SetPageError(page);
- ret = ret < 0 ? ret : -EIO;
+ ret = err < 0 ? err : -EIO;
mapping_set_error(page->mapping, ret);
}
}
@@ -2461,6 +2450,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
*/
static void end_bio_extent_writepage(struct bio *bio)
{
+ int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
u64 start;
u64 end;
@@ -2490,7 +2480,7 @@ static void end_bio_extent_writepage(struct bio *bio)
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
- end_extent_writepage(page, bio->bi_error, start, end);
+ end_extent_writepage(page, error, start, end);
end_page_writeback(page);
}
@@ -2523,9 +2513,9 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
- int uptodate = !bio->bi_error;
+ int uptodate = !bio->bi_status;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- struct extent_io_tree *tree;
+ struct extent_io_tree *tree, *failure_tree;
u64 offset = 0;
u64 start;
u64 end;
@@ -2543,9 +2533,10 @@ static void end_bio_extent_readpage(struct bio *bio)
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- (u64)bio->bi_iter.bi_sector, bio->bi_error,
+ (u64)bio->bi_iter.bi_sector, bio->bi_status,
io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
+ failure_tree = &BTRFS_I(inode)->io_failure_tree;
/* We always issue full-page reads, but if some block
* in a page fails to read, blk_update_request() will
@@ -2575,8 +2566,10 @@ static void end_bio_extent_readpage(struct bio *bio)
if (ret)
uptodate = 0;
else
- clean_io_failure(BTRFS_I(inode), start,
- page, 0);
+ clean_io_failure(BTRFS_I(inode)->root->fs_info,
+ failure_tree, tree, start,
+ page,
+ btrfs_ino(BTRFS_I(inode)), 0);
}
if (likely(uptodate))
@@ -2602,7 +2595,7 @@ static void end_bio_extent_readpage(struct bio *bio)
ret = bio_readpage_error(bio, offset, page,
start, end, mirror);
if (ret == 0) {
- uptodate = !bio->bi_error;
+ uptodate = !bio->bi_status;
offset += len;
continue;
}
@@ -2660,77 +2653,80 @@ readpage_ok:
endio_readpage_release_extent(tree, extent_start, extent_len,
uptodate);
if (io_bio->end_io)
- io_bio->end_io(io_bio, bio->bi_error);
+ io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status));
bio_put(bio);
}
/*
- * this allocates from the btrfs_bioset. We're returning a bio right now
- * but you can call btrfs_io_bio for the appropriate container_of magic
+ * Initialize the members up to but not including 'bio'. Use after allocating a
+ * new bio via bio_alloc_bioset(), which does not initialize the members outside
+ * of 'bio' because __GFP_ZERO is not supported.
*/
-struct bio *
-btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
- gfp_t gfp_flags)
+static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
{
- struct btrfs_io_bio *btrfs_bio;
- struct bio *bio;
-
- bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
+ memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
+}
- if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2)) {
- bio = bio_alloc_bioset(gfp_flags,
- nr_vecs, btrfs_bioset);
- }
- }
+/*
+ * The following helpers allocate a bio. As it's backed by a bioset, it'll
+ * never fail. A bio is returned directly, but btrfs_io_bio() can be called
+ * for the appropriate container_of magic.
+ */
+struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
+{
+ struct bio *bio;
- if (bio) {
- bio->bi_bdev = bdev;
- bio->bi_iter.bi_sector = first_sector;
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_bio->csum = NULL;
- btrfs_bio->csum_allocated = NULL;
- btrfs_bio->end_io = NULL;
- }
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset);
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = first_byte >> 9;
+ btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
}
-struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
+struct bio *btrfs_bio_clone(struct bio *bio)
{
struct btrfs_io_bio *btrfs_bio;
struct bio *new;
- new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
- if (new) {
- btrfs_bio = btrfs_io_bio(new);
- btrfs_bio->csum = NULL;
- btrfs_bio->csum_allocated = NULL;
- btrfs_bio->end_io = NULL;
- }
+ /* Bio allocation backed by a bioset does not fail */
+ new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset);
+ btrfs_bio = btrfs_io_bio(new);
+ btrfs_io_bio_init(btrfs_bio);
+ btrfs_bio->iter = bio->bi_iter;
return new;
}
-/* this also allocates from the btrfs_bioset */
-struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
{
- struct btrfs_io_bio *btrfs_bio;
struct bio *bio;
- bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
- if (bio) {
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_bio->csum = NULL;
- btrfs_bio->csum_allocated = NULL;
- btrfs_bio->end_io = NULL;
- }
+ /* Bio allocation backed by a bioset does not fail */
+ bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset);
+ btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
}
+struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
+{
+ struct bio *bio;
+ struct btrfs_io_bio *btrfs_bio;
+
+ /* this will never fail when it's backed by a bioset */
+ bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset);
+ ASSERT(bio);
+
+ btrfs_bio = btrfs_io_bio(bio);
+ btrfs_io_bio_init(btrfs_bio);
+
+ bio_trim(bio, offset >> 9, size >> 9);
+ btrfs_bio->iter = bio->bi_iter;
+ return bio;
+}
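A hypothetical caller, to illustrate the intended use: offset and size are byte counts (converted to sectors for bio_trim()), so both must be sector-aligned. Carving a 64K sub-range out of a larger bio might look like:

	/* hypothetical: submit the first 64K of a large bio on its own */
	struct bio *part = btrfs_bio_clone_partial(orig_bio, 0, SZ_64K);

	part->bi_end_io = my_end_io;	/* assumed completion callback */
	submit_bio(part);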
static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags)
{
- int ret = 0;
+ blk_status_t ret = 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
struct page *page = bvec->bv_page;
struct extent_io_tree *tree = bio->bi_private;
@@ -2742,13 +2738,13 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
bio_get(bio);
if (tree->ops)
- ret = tree->ops->submit_bio_hook(page->mapping->host, bio,
+ ret = tree->ops->submit_bio_hook(tree->private_data, bio,
mirror_num, bio_flags, start);
else
btrfsic_submit_bio(bio);
bio_put(bio);
- return ret;
+ return blk_status_to_errno(ret);
}
static int merge_bio(struct extent_io_tree *tree, struct page *page,
@@ -2805,14 +2801,11 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree,
}
}
- bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES,
- GFP_NOFS | __GFP_HIGH);
- if (!bio)
- return -ENOMEM;
-
+ bio = btrfs_bio_alloc(bdev, sector << 9);
bio_add_page(bio, page, page_size, offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
+ bio->bi_write_hint = page->mapping->host->i_write_hint;
bio_set_op_attrs(bio, op, op_flags);
if (wbc) {
wbc_init_bio(wbc, bio);
@@ -2859,7 +2852,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
em = *em_cached;
if (extent_map_in_tree(em) && start >= em->start &&
start < extent_map_end(em)) {
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
return em;
}
@@ -2870,7 +2863,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
*em_cached = em;
}
return em;
@@ -3584,9 +3577,9 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
spin_unlock(&eb->refs_lock);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
- __percpu_counter_add(&fs_info->dirty_metadata_bytes,
- -eb->len,
- fs_info->dirty_metadata_batch);
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ -eb->len,
+ fs_info->dirty_metadata_batch);
ret = 1;
} else {
spin_unlock(&eb->refs_lock);
@@ -3694,7 +3687,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
BUG_ON(!eb);
done = atomic_dec_and_test(&eb->io_pages);
- if (bio->bi_error ||
+ if (bio->bi_status ||
test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
set_btree_ioerr(page);
@@ -3744,7 +3737,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
* header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
*/
start = btrfs_item_nr_offset(nritems);
- end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb);
+ end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb);
memzero_extent_buffer(eb, start, end - start);
}
@@ -4364,6 +4357,119 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
return NULL;
}
+/*
+ * Cache of the previous fiemap extent
+ *
+ * Used for merging fiemap extents
+ */
+struct fiemap_cache {
+ u64 offset;
+ u64 phys;
+ u64 len;
+ u32 flags;
+ bool cached;
+};
+
+/*
+ * Helper to submit a fiemap extent.
+ *
+ * Try to merge the current fiemap extent, specified by @offset, @phys,
+ * @len and @flags, with the cached one.
+ * Only when the merge fails is the cached one submitted as a fiemap
+ * extent.
+ *
+ * Return value is the same as fiemap_fill_next_extent().
+ */
+static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ u64 offset, u64 phys, u64 len, u32 flags)
+{
+ int ret = 0;
+
+ if (!cache->cached)
+ goto assign;
+
+ /*
+ * Sanity check: extent_fiemap() should have ensured that the new
+ * fiemap extent won't overlap with the cached one.
+ * Not recoverable.
+ *
+ * NOTE: Physical address can overlap, due to compression
+ */
+ if (cache->offset + cache->len > offset) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ /*
+ * Only merge fiemap extents if
+ * 1) Their logical addresses are contiguous
+ *
+ * 2) Their physical addresses are contiguous, so truly compressed
+ * (physical size smaller than logical size) extents won't get merged
+ * with each other
+ *
+ * 3) They share the same flags except FIEMAP_EXTENT_LAST, so a regular
+ * extent won't get merged with a prealloc extent
+ */
+ if (cache->offset + cache->len == offset &&
+ cache->phys + cache->len == phys &&
+ (cache->flags & ~FIEMAP_EXTENT_LAST) ==
+ (flags & ~FIEMAP_EXTENT_LAST)) {
+ cache->len += len;
+ cache->flags |= flags;
+ goto try_submit_last;
+ }
+
+ /* Not mergeable, need to submit cached one */
+ ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+ cache->len, cache->flags);
+ cache->cached = false;
+ if (ret)
+ return ret;
+assign:
+ cache->cached = true;
+ cache->offset = offset;
+ cache->phys = phys;
+ cache->len = len;
+ cache->flags = flags;
+try_submit_last:
+ if (cache->flags & FIEMAP_EXTENT_LAST) {
+ ret = fiemap_fill_next_extent(fieinfo, cache->offset,
+ cache->phys, cache->len, cache->flags);
+ cache->cached = false;
+ }
+ return ret;
+}
+
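A worked example of the merge rule, assuming 4K sectors and purely illustrative addresses: two logically and physically contiguous extents collapse into one record, while a neighbor with different flags forces the cached record out first.

	emit_fiemap_extent(fieinfo, &cache, 0,     SZ_1M,         SZ_4K, 0); /* cached */
	emit_fiemap_extent(fieinfo, &cache, SZ_4K, SZ_1M + SZ_4K, SZ_4K, 0); /* merged:
					     * cache now covers 0..8K */
	emit_fiemap_extent(fieinfo, &cache, SZ_8K, SZ_2M,         SZ_4K,
			   FIEMAP_EXTENT_ENCODED); /* flags differ: the 0..8K record
						    * is submitted, this one becomes
						    * the new cached extent */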
+/*
+ * Emit the last cached fiemap extent
+ *
+ * The last fiemap extent may still be cached in the following case:
+ * 0 4k 8k
+ * |<- Fiemap range ->|
+ * |<------------ First extent ----------->|
+ *
+ * In this case, the first extent range will be cached but not emitted.
+ * So we must emit it before ending extent_fiemap().
+ */
+static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
+ struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache)
+{
+ int ret;
+
+ if (!cache->cached)
+ return 0;
+
+ ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+ cache->len, cache->flags);
+ cache->cached = false;
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, get_extent_t *get_extent)
{
@@ -4381,6 +4487,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct fiemap_cache cache = { 0 };
int end = 0;
u64 em_start = 0;
u64 em_len = 0;
@@ -4560,8 +4667,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
flags |= FIEMAP_EXTENT_LAST;
end = 1;
}
- ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
- em_len, flags);
+ ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
+ em_len, flags);
if (ret) {
if (ret == 1)
ret = 0;
@@ -4569,6 +4676,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
}
out_free:
+ if (!ret)
+ ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache);
free_extent_map(em);
out:
btrfs_free_path(path);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 3e4fad4a909d..3fb8513bf02e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -2,6 +2,7 @@
#define __EXTENTIO__
#include <linux/rbtree.h>
+#include <linux/refcount.h>
#include "ulist.h"
/* bits for the extent state */
@@ -14,14 +15,17 @@
#define EXTENT_DEFRAG (1U << 6)
#define EXTENT_BOUNDARY (1U << 9)
#define EXTENT_NODATASUM (1U << 10)
-#define EXTENT_DO_ACCOUNTING (1U << 11)
+#define EXTENT_CLEAR_META_RESV (1U << 11)
#define EXTENT_FIRST_DELALLOC (1U << 12)
#define EXTENT_NEED_WAIT (1U << 13)
#define EXTENT_DAMAGED (1U << 14)
#define EXTENT_NORESERVE (1U << 15)
#define EXTENT_QGROUP_RESERVED (1U << 16)
#define EXTENT_CLEAR_DATA_RESV (1U << 17)
+#define EXTENT_DELALLOC_NEW (1U << 18)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
+ EXTENT_CLEAR_DATA_RESV)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
/*
@@ -88,7 +92,7 @@ struct btrfs_inode;
struct btrfs_io_bio;
struct io_failure_record;
-typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset);
struct extent_io_ops {
@@ -104,32 +108,36 @@ struct extent_io_ops {
size_t size, struct bio *bio,
unsigned long bio_flags);
int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
+ struct btrfs_fs_info *(*tree_fs_info)(void *private_data);
+ void (*set_range_writeback)(void *private_data, u64 start, u64 end);
/*
* Optional hooks, called if the pointer is not NULL
*/
- int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+ int (*fill_delalloc)(void *private_data, struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written);
int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate);
- void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+ void (*set_bit_hook)(void *private_data, struct extent_state *state,
unsigned *bits);
- void (*clear_bit_hook)(struct btrfs_inode *inode,
+ void (*clear_bit_hook)(void *private_data,
struct extent_state *state,
unsigned *bits);
- void (*merge_extent_hook)(struct inode *inode,
+ void (*merge_extent_hook)(void *private_data,
struct extent_state *new,
struct extent_state *other);
- void (*split_extent_hook)(struct inode *inode,
+ void (*split_extent_hook)(void *private_data,
struct extent_state *orig, u64 split);
+ void (*check_extent_io_range)(void *private_data, const char *caller,
+ u64 start, u64 end);
};
struct extent_io_tree {
struct rb_root state;
- struct address_space *mapping;
+ void *private_data;
u64 dirty_bytes;
int track_uptodate;
spinlock_t lock;
@@ -143,7 +151,7 @@ struct extent_state {
/* ADD NEW ELEMENTS AFTER THIS */
wait_queue_head_t wq;
- atomic_t refs;
+ refcount_t refs;
unsigned state;
struct io_failure_record *failrec;
@@ -201,12 +209,46 @@ struct extent_buffer {
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
- u64 bytes_changed;
+ unsigned int bytes_changed;
/* Changed ranges */
struct ulist range_changed;
};
+static inline void extent_changeset_init(struct extent_changeset *changeset)
+{
+ changeset->bytes_changed = 0;
+ ulist_init(&changeset->range_changed);
+}
+
+static inline struct extent_changeset *extent_changeset_alloc(void)
+{
+ struct extent_changeset *ret;
+
+ ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return NULL;
+
+ extent_changeset_init(ret);
+ return ret;
+}
+
+static inline void extent_changeset_release(struct extent_changeset *changeset)
+{
+ if (!changeset)
+ return;
+ changeset->bytes_changed = 0;
+ ulist_release(&changeset->range_changed);
+}
+
+static inline void extent_changeset_free(struct extent_changeset *changeset)
+{
+ if (!changeset)
+ return;
+ extent_changeset_release(changeset);
+ kfree(changeset);
+}
+
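The intended lifecycle, matching the file.c hunks below: release between reservation attempts so the recorded ranges are dropped and the ulist can be reused, and free once at the end (both release and free accept NULL).

	struct extent_changeset *data_reserved = NULL;

	while (more_to_write) {				/* illustrative loop */
		extent_changeset_release(data_reserved);	/* drop the previous ranges */
		ret = btrfs_check_data_free_space(inode, &data_reserved,
						  pos, write_bytes);
		if (ret < 0)
			break;
		/* ... copy data, dirty pages ... */
	}
	extent_changeset_free(data_reserved);		/* once, at the very end */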
static inline void extent_set_compress_type(unsigned long *bio_flags,
int compress_type)
{
@@ -226,8 +268,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
u64 start, u64 len,
int create);
-void extent_io_tree_init(struct extent_io_tree *tree,
- struct address_space *mapping);
+void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
@@ -455,20 +496,21 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
u64 delalloc_end, struct page *locked_page,
unsigned bits_to_clear,
unsigned long page_ops);
-struct bio *
-btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
- gfp_t gfp_flags);
-struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
+struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte);
+struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
+struct bio *btrfs_bio_clone(struct bio *bio);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
struct btrfs_fs_info;
struct btrfs_inode;
-int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
- u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
-int clean_io_failure(struct btrfs_inode *inode, u64 start,
- struct page *page, unsigned int pg_offset);
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num);
+int clean_io_failure(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree, u64 start,
+ struct page *page, u64 ino, unsigned int pg_offset);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int mirror_num);
@@ -503,7 +545,9 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
struct io_failure_record *failrec,
struct page *page, int pg_offset, int icsum,
bio_end_io_t *endio_func, void *data);
-int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec);
+int free_io_failure(struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree,
+ struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
noinline u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 26f9ac719d20..69850155870c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -55,7 +55,7 @@ struct extent_map *alloc_extent_map(void)
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
em->generation = 0;
- atomic_set(&em->refs, 1);
+ refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
}
@@ -71,8 +71,8 @@ void free_extent_map(struct extent_map *em)
{
if (!em)
return;
- WARN_ON(atomic_read(&em->refs) == 0);
- if (atomic_dec_and_test(&em->refs)) {
+ WARN_ON(refcount_read(&em->refs) == 0);
+ if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
@@ -322,7 +322,7 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em,
int modified)
{
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
em->mod_start = em->start;
em->mod_len = em->len;
@@ -381,7 +381,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
if (strict && !(end > em->start && start < extent_map_end(em)))
return NULL;
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
return em;
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index eb8b8fae036b..a67b2def5413 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -2,6 +2,7 @@
#define __EXTENTMAP__
#include <linux/rbtree.h>
+#include <linux/refcount.h>
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
@@ -41,7 +42,7 @@ struct extent_map {
*/
struct map_lookup *map_lookup;
};
- atomic_t refs;
+ refcount_t refs;
unsigned int compress_type;
struct list_head list;
};
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 64fcb31d7163..fdcb41002623 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -160,11 +160,12 @@ static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err)
kfree(bio->csum_allocated);
}
-static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
+static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u64 logical_offset, u32 *dst, int dio)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct bio_vec *bvec;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
struct btrfs_csum_item *item = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -177,12 +178,12 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u64 page_bytes_left;
u32 diff;
int nblocks;
- int count = 0, i;
+ int count = 0;
u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
path = btrfs_alloc_path();
if (!path)
- return -ENOMEM;
+ return BLK_STS_RESOURCE;
nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
if (!dst) {
@@ -191,7 +192,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
csum_size, GFP_NOFS);
if (!btrfs_bio->csum_allocated) {
btrfs_free_path(path);
- return -ENOMEM;
+ return BLK_STS_RESOURCE;
}
btrfs_bio->csum = btrfs_bio->csum_allocated;
btrfs_bio->end_io = btrfs_io_bio_endio_readpage;
@@ -206,8 +207,6 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
path->reada = READA_FORWARD;
- WARN_ON(bio->bi_vcnt <= 0);
-
/*
* the free space stuff is only read when it hasn't been
* updated in the current transaction. So, we can safely
@@ -223,13 +222,13 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
if (dio)
offset = logical_offset;
- bio_for_each_segment_all(bvec, bio, i) {
- page_bytes_left = bvec->bv_len;
+ bio_for_each_segment(bvec, bio, iter) {
+ page_bytes_left = bvec.bv_len;
if (count)
goto next;
if (!dio)
- offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+ offset = page_offset(bvec.bv_page) + bvec.bv_offset;
count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
(u32 *)csum, nblocks);
if (count)
@@ -303,12 +302,12 @@ next:
return 0;
}
-int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst)
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst)
{
return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0);
}
-int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset)
+blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset)
{
return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1);
}
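The iterator switch above matters for cloned bios: bio_for_each_segment_all() walks the full bi_io_vec array of the underlying bio, while bio_for_each_segment() walks only the range described by bi_iter, which is exactly what a partial clone covers. The by-value pattern is:

	struct bio_vec bvec;
	struct bvec_iter iter;
	u32 bytes = 0;

	/* visits exactly the segments covered by bio->bi_iter */
	bio_for_each_segment(bvec, bio, iter)
		bytes += bvec.bv_len;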
@@ -433,26 +432,26 @@ fail:
return ret;
}
-int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
u64 file_start, int contig)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
char *data;
- struct bio_vec *bvec;
+ struct bvec_iter iter;
+ struct bio_vec bvec;
int index;
int nr_sectors;
- int i, j;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
+ int i;
u64 offset;
- WARN_ON(bio->bi_vcnt <= 0);
sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
GFP_NOFS);
if (!sums)
- return -ENOMEM;
+ return BLK_STS_RESOURCE;
sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
@@ -465,19 +464,19 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
index = 0;
- bio_for_each_segment_all(bvec, bio, j) {
+ bio_for_each_segment(bvec, bio, iter) {
if (!contig)
- offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+ offset = page_offset(bvec.bv_page) + bvec.bv_offset;
if (!ordered) {
ordered = btrfs_lookup_ordered_extent(inode, offset);
BUG_ON(!ordered); /* Logic error */
}
- data = kmap_atomic(bvec->bv_page);
+ data = kmap_atomic(bvec.bv_page);
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
- bvec->bv_len + fs_info->sectorsize
+ bvec.bv_len + fs_info->sectorsize
- 1);
for (i = 0; i < nr_sectors; i++) {
@@ -504,12 +503,12 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+ total_bytes;
index = 0;
- data = kmap_atomic(bvec->bv_page);
+ data = kmap_atomic(bvec.bv_page);
}
sums->sums[index] = ~(u32)0;
sums->sums[index]
- = btrfs_csum_data(data + bvec->bv_offset
+ = btrfs_csum_data(data + bvec.bv_offset
+ (i * fs_info->sectorsize),
sums->sums[index],
fs_info->sectorsize);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 520cb7230b2d..9e75d8a39aac 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1404,6 +1404,47 @@ fail:
}
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+ const u64 start,
+ const u64 len,
+ struct extent_state **cached_state)
+{
+ u64 search_start = start;
+ const u64 end = start + len - 1;
+
+ while (search_start < end) {
+ const u64 search_len = end - search_start + 1;
+ struct extent_map *em;
+ u64 em_len;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, search_start,
+ search_len, 0);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ if (em->block_start != EXTENT_MAP_HOLE)
+ goto next;
+
+ em_len = em->len;
+ if (em->start < search_start)
+ em_len -= search_start - em->start;
+ if (em_len > search_len)
+ em_len = search_len;
+
+ ret = set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW,
+ NULL, cached_state, GFP_NOFS);
+next:
+ search_start = extent_map_end(em);
+ free_extent_map(em);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
/*
* This function locks the extent and properly waits for data=ordered extents
* to finish before allowing the pages to be modified if needed.
@@ -1432,8 +1473,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
+ round_up(pos + write_bytes - start_pos,
fs_info->sectorsize) - 1;
- if (start_pos < inode->vfs_inode.i_size) {
+ if (start_pos < inode->vfs_inode.i_size ||
+ (inode->flags & BTRFS_INODE_PREALLOC)) {
struct btrfs_ordered_extent *ordered;
+ unsigned int clear_bits;
+
lock_extent_bits(&inode->io_tree, start_pos, last_pos,
cached_state);
ordered = btrfs_lookup_ordered_range(inode, start_pos,
@@ -1454,11 +1498,19 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
if (ordered)
btrfs_put_ordered_extent(ordered);
-
+ ret = btrfs_find_new_delalloc_bytes(inode, start_pos,
+ last_pos - start_pos + 1,
+ cached_state);
+ clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG;
+ if (ret)
+ clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED;
clear_extent_bit(&inode->io_tree, start_pos,
- last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached_state, GFP_NOFS);
+ last_pos, clear_bits,
+ (clear_bits & EXTENT_LOCKED) ? 1 : 0,
+ 0, cached_state, GFP_NOFS);
+ if (ret)
+ return ret;
*lockstart = start_pos;
*lockend = last_pos;
ret = 1;
@@ -1529,6 +1581,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
@@ -1576,7 +1629,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
reserve_bytes = round_up(write_bytes + sector_offset,
fs_info->sectorsize);
- ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+ extent_changeset_release(data_reserved);
+ ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
+ write_bytes);
if (ret < 0) {
if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) &&
@@ -1605,8 +1660,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
reserve_bytes);
if (ret) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode, pos,
- write_bytes);
+ btrfs_free_reserved_data_space(inode,
+ data_reserved, pos,
+ write_bytes);
else
btrfs_end_write_no_snapshoting(root);
break;
@@ -1688,8 +1744,9 @@ again:
__pos = round_down(pos,
fs_info->sectorsize) +
(dirty_pages << PAGE_SHIFT);
- btrfs_delalloc_release_space(inode, __pos,
- release_bytes);
+ btrfs_delalloc_release_space(inode,
+ data_reserved, __pos,
+ release_bytes);
}
}
@@ -1744,12 +1801,13 @@ again:
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes);
} else {
- btrfs_delalloc_release_space(inode,
- round_down(pos, fs_info->sectorsize),
- release_bytes);
+ btrfs_delalloc_release_space(inode, data_reserved,
+ round_down(pos, fs_info->sectorsize),
+ release_bytes);
}
}
+ extent_changeset_free(data_reserved);
return num_written ? num_written : ret;
}
@@ -1824,17 +1882,36 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
ssize_t err;
loff_t pos;
- size_t count;
+ size_t count = iov_iter_count(from);
loff_t oldsize;
int clean_page = 0;
- inode_lock(inode);
+ if (!inode_trylock(inode)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_lock(inode);
+ }
+
err = generic_write_checks(iocb, from);
if (err <= 0) {
inode_unlock(inode);
return err;
}
+ pos = iocb->ki_pos;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /*
+ * We will allocate space if nodatacow is not set,
+ * so bail out.
+ */
+ if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC)) ||
+ check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
+ inode_unlock(inode);
+ return -EAGAIN;
+ }
+ }
+
current->backing_dev_info = inode_to_bdi(inode);
err = file_remove_privs(file);
if (err) {
@@ -1862,8 +1939,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
*/
update_time_for_write(inode);
- pos = iocb->ki_pos;
- count = iov_iter_count(from);
start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
if (start_pos > oldsize) {
@@ -1959,7 +2034,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
- int ret = 0;
+ int ret = 0, err;
bool full_sync = 0;
u64 len;
@@ -1978,7 +2053,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret)
- return ret;
+ goto out;
inode_lock(inode);
atomic_inc(&root->log_batch);
@@ -2083,10 +2158,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* An ordered extent might have started before and completed
* already with io errors, in which case the inode was not
* updated and we end up here. So check the inode's mapping
- * flags for any errors that might have happened while doing
- * writeback of file data.
+ * for any errors that might have happened since we last
+ * called fsync.
*/
- ret = filemap_check_errors(inode->i_mapping);
+ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
inode_unlock(inode);
goto out;
}
@@ -2175,6 +2250,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = btrfs_end_transaction(trans);
}
out:
+ err = file_check_and_advance_wb_err(file);
+ if (!ret)
+ ret = err;
return ret > 0 ? -EIO : ret;
}
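Both helpers come from the generic errseq_t writeback-error API; roughly (a sketch of the VFS pattern, not btrfs code): the file samples the mapping's error cursor at open time, the fast path above compares against that sample, and the slow path reports any new error while advancing the file's cursor so the same error is reported only once per file descriptor.

	errseq_t since = filemap_sample_wb_err(file->f_mapping); /* done at open,
								  * stored in file->f_wb_err */

	err = filemap_check_wb_err(mapping, since);	/* peek: any error since the sample? */
	err = file_check_and_advance_wb_err(file);	/* report it and advance the cursor */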
@@ -2338,17 +2416,15 @@ out:
*/
static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
int ret = 0;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0);
- if (IS_ERR_OR_NULL(em)) {
- if (!em)
- ret = -ENOMEM;
- else
- ret = PTR_ERR(em);
- return ret;
- }
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+ round_down(*start, fs_info->sectorsize),
+ round_up(*len, fs_info->sectorsize), 0);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
/* Hole or vacuum extent(only exists in no-hole mode) */
if (em->block_start == EXTENT_MAP_HOLE) {
@@ -2722,6 +2798,7 @@ static long btrfs_fallocate(struct file *file, int mode,
{
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
struct falloc_range *range;
struct falloc_range *tmp;
struct list_head reserve_list;
@@ -2835,11 +2912,8 @@ static long btrfs_fallocate(struct file *file, int mode,
while (1) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
- if (IS_ERR_OR_NULL(em)) {
- if (!em)
- ret = -ENOMEM;
- else
- ret = PTR_ERR(em);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
break;
}
last_byte = min(extent_map_end(em), alloc_end);
@@ -2854,18 +2928,20 @@ static long btrfs_fallocate(struct file *file, int mode,
free_extent_map(em);
break;
}
- ret = btrfs_qgroup_reserve_data(inode, cur_offset,
- last_byte - cur_offset);
- if (ret < 0)
+ ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+ cur_offset, last_byte - cur_offset);
+ if (ret < 0) {
+ free_extent_map(em);
break;
+ }
} else {
/*
* Do not need to reserve unwritten extent for this
* range, free reserved data space first, otherwise
* it'll result in false ENOSPC error.
*/
- btrfs_free_reserved_data_space(inode, cur_offset,
- last_byte - cur_offset);
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ cur_offset, last_byte - cur_offset);
}
free_extent_map(em);
cur_offset = last_byte;
@@ -2884,8 +2960,9 @@ static long btrfs_fallocate(struct file *file, int mode,
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
else
- btrfs_free_reserved_data_space(inode, range->start,
- range->len);
+ btrfs_free_reserved_data_space(inode,
+ data_reserved, range->start,
+ range->len);
list_del(&range->list);
kfree(range);
}
@@ -2923,8 +3000,9 @@ out:
inode_unlock(inode);
/* Let go of our reservation. */
if (ret != 0)
- btrfs_free_reserved_data_space(inode, alloc_start,
- alloc_end - cur_offset);
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ alloc_start, alloc_end - cur_offset);
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -3025,13 +3103,19 @@ out:
return offset;
}
+static int btrfs_file_open(struct inode *inode, struct file *filp)
+{
+ filp->f_mode |= FMODE_AIO_NOWAIT;
+ return generic_file_open(inode, filp);
+}
+
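For context, a hedged userspace sketch of what FMODE_AIO_NOWAIT enables: an io_submit() iocb flagged RWF_NOWAIT now completes with -EAGAIN instead of blocking whenever the write would have to allocate space (the nodatacow check added in btrfs_file_write_iter above).

	/* hypothetical raw-aio submission; setup and error handling omitted */
	struct iocb cb = {
		.aio_fildes     = fd,
		.aio_lio_opcode = IOCB_CMD_PWRITE,
		.aio_buf        = (__u64)(uintptr_t)buf,
		.aio_nbytes     = len,
		.aio_offset     = off,
		.aio_rw_flags   = RWF_NOWAIT,	/* fail with -EAGAIN rather than block */
	};
	struct iocb *cbs[1] = { &cb };

	ret = io_submit(ctx, 1, cbs);	/* a completion with res == -EAGAIN means
					 * "would block"; fall back to a normal write */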
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
.read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
.mmap = btrfs_file_mmap,
- .open = generic_file_open,
+ .open = btrfs_file_open,
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
.fallocate = btrfs_fallocate,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index da6841efac26..c5e6180cdb8c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -355,7 +355,7 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
io_ctl->orig = io_ctl->cur;
io_ctl->size = PAGE_SIZE;
if (clear)
- memset(io_ctl->cur, 0, PAGE_SIZE);
+ clear_page(io_ctl->cur);
}
static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index dd7fb22a955a..a5e34de06c2f 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -17,7 +17,7 @@
*/
#include <linux/kernel.h>
-#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "locking.h"
@@ -153,22 +153,21 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
static u8 *alloc_bitmap(u32 bitmap_size)
{
- void *mem;
+ u8 *ret;
+ unsigned int nofs_flag;
/*
- * The allocation size varies, observed numbers were < 4K up to 16K.
- * Using vmalloc unconditionally would be too heavy, we'll try
- * contiguous allocations first.
+ * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse
+ * into the filesystem as the free space bitmap can be modified in the
+ * critical section of a transaction commit.
+ *
+ * TODO: push the memalloc_nofs_{save,restore}() to the caller where we
+ * know that recursion is unsafe.
*/
- if (bitmap_size <= PAGE_SIZE)
- return kzalloc(bitmap_size, GFP_NOFS);
-
- mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN);
- if (mem)
- return mem;
-
- return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL);
+ nofs_flag = memalloc_nofs_save();
+ ret = kvzalloc(bitmap_size, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+ return ret;
}
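The TODO refers to the scoped-allocation idiom: rather than hiding the save/restore in the helper, the caller that knows recursion is unsafe (here, the transaction-commit critical section) would mark itself, making every nested GFP_KERNEL allocation implicitly NOFS. A purely illustrative caller-side shape:

	unsigned int nofs_flag;

	nofs_flag = memalloc_nofs_save();	/* entering a section where fs reclaim
						 * could deadlock on the running commit */
	bitmap = alloc_bitmap(bitmap_size);	/* kvzalloc(GFP_KERNEL) now acts as NOFS */
	/* ... modify the free space tree ... */
	memalloc_nofs_restore(nofs_flag);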
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
@@ -1189,11 +1188,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
-
- return 0;
+ return btrfs_commit_transaction(trans);
abort:
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
@@ -1278,11 +1273,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
free_extent_buffer(free_space_root->commit_root);
kfree(free_space_root);
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
-
- return 0;
+ return btrfs_commit_transaction(trans);
abort:
btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index a97fdc156a03..baacc1866861 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -38,6 +38,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
{
SHASH_DESC_ON_STACK(shash, tfm);
u32 *ctx = (u32 *)shash_desc_ctx(shash);
+ u32 retval;
int err;
shash->tfm = tfm;
@@ -47,5 +48,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
err = crypto_shash_update(shash, address, length);
BUG_ON(err);
- return *ctx;
+ retval = *ctx;
+ barrier_data(ctx);
+ return retval;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 5c6c20ec64d8..d02019747d00 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -400,6 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
struct btrfs_path *path;
struct inode *inode;
struct btrfs_block_rsv *rsv;
+ struct extent_changeset *data_reserved = NULL;
u64 num_bytes;
u64 alloc_hint = 0;
int ret;
@@ -492,7 +493,7 @@ again:
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_SIZE;
- ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
if (ret)
goto out_put;
@@ -516,6 +517,7 @@ out:
trans->bytes_reserved = num_bytes;
btrfs_free_path(path);
+ extent_changeset_free(data_reserved);
return ret;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5e71f1ea3391..06dea7c89bbd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -86,7 +86,6 @@ static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
-struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
@@ -115,6 +114,31 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
u64 ram_bytes, int compress_type,
int type);
+static void __endio_write_update_ordered(struct inode *inode,
+ const u64 offset, const u64 bytes,
+ const bool uptodate);
+
+/*
+ * Clean up all submitted ordered extents in the specified range to handle
+ * errors from the fill_delalloc() callback.
+ *
+ * NOTE: the caller must ensure that when an error happens, it does not call
+ * extent_clear_unlock_delalloc() to clear both the EXTENT_DO_ACCOUNTING and
+ * EXTENT_DELALLOC bits simultaneously, because that causes the reserved metadata
+ * to be released, which we want to happen only when finishing the ordered
+ * extent (btrfs_finish_ordered_io()). Also note that the caller of the
+ * fill_delalloc() callback already does proper cleanup for the first page of
+ * the range, that is, it invokes the callback writepage_end_io_hook() for the
+ * range of the first page.
+ */
+static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
+ const u64 offset,
+ const u64 bytes)
+{
+ return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
+ bytes - PAGE_SIZE, false);
+}
+
static int btrfs_dirty_inode(struct inode *inode);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -153,7 +177,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
char *kaddr;
unsigned long ptr;
struct btrfs_file_extent_item *ei;
- int err = 0;
int ret;
size_t cur_size = size;
unsigned long offset;
@@ -175,10 +198,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
- if (ret) {
- err = ret;
+ if (ret)
goto fail;
- }
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -233,9 +254,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->disk_i_size = inode->i_size;
ret = btrfs_update_inode(trans, root, inode);
- return ret;
fail:
- return err;
+ return ret;
}
@@ -325,7 +345,7 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
btrfs_free_path(path);
btrfs_end_transaction(trans);
return ret;
@@ -547,7 +567,7 @@ cont:
}
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
- EXTENT_DEFRAG;
+ EXTENT_DELALLOC_NEW | EXTENT_DEFRAG;
unsigned long page_error_op;
clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
@@ -565,8 +585,10 @@ cont:
PAGE_SET_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
- btrfs_free_reserved_data_space_noquota(inode, start,
- end - start + 1);
+ if (ret == 0)
+ btrfs_free_reserved_data_space_noquota(inode,
+ start,
+ end - start + 1);
goto free_pages_out;
}
}
@@ -581,12 +603,11 @@ cont:
/*
* one last check to make sure the compression is really a
- * win, compare the page count read with the blocks on disk
+ * win: compare the page count read with the blocks on disk;
+ * compression must free at least one sector's worth of space
*/
total_in = ALIGN(total_in, PAGE_SIZE);
- if (total_compressed >= total_in) {
- will_compress = 0;
- } else {
+ if (total_compressed + blocksize <= total_in) {
num_bytes = total_in;
*num_added += 1;
@@ -815,13 +836,12 @@ retry:
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK);
- ret = btrfs_submit_compressed_write(inode,
+ if (btrfs_submit_compressed_write(inode,
async_extent->start,
async_extent->ram_size,
ins.objectid,
ins.offset, async_extent->pages,
- async_extent->nr_pages);
- if (ret) {
+ async_extent->nr_pages)) {
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *p = async_extent->pages[0];
const u64 start = async_extent->start;
@@ -852,6 +872,7 @@ out_free:
async_extent->start +
async_extent->ram_size - 1,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
@@ -918,10 +939,13 @@ static noinline int cow_file_range(struct inode *inode,
u64 num_bytes;
unsigned long ram_size;
u64 disk_num_bytes;
- u64 cur_alloc_size;
+ u64 cur_alloc_size = 0;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
struct extent_map *em;
+ unsigned clear_bits;
+ unsigned long page_ops;
+ bool extent_reserved = false;
int ret = 0;
if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
@@ -944,6 +968,7 @@ static noinline int cow_file_range(struct inode *inode,
extent_clear_unlock_delalloc(inode, start, end,
delalloc_end, NULL,
EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG, PAGE_UNLOCK |
PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
PAGE_END_WRITEBACK);
@@ -966,14 +991,14 @@ static noinline int cow_file_range(struct inode *inode,
start + num_bytes - 1, 0);
while (disk_num_bytes > 0) {
- unsigned long op;
-
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
fs_info->sectorsize, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
+ cur_alloc_size = ins.offset;
+ extent_reserved = true;
ram_size = ins.offset;
em = create_io_em(inode, start, ins.offset, /* len */
@@ -988,7 +1013,6 @@ static noinline int cow_file_range(struct inode *inode,
goto out_reserve;
free_extent_map(em);
- cur_alloc_size = ins.offset;
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
ram_size, cur_alloc_size, 0);
if (ret)
@@ -998,15 +1022,24 @@ static noinline int cow_file_range(struct inode *inode,
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
+ /*
+ * Only drop the extent cache here, and process as normal.
+ *
+ * We must not allow extent_clear_unlock_delalloc() at the
+ * out_unlock label to free the metadata of this ordered
+ * extent, as that should be freed only by
+ * btrfs_finish_ordered_io().
+ *
+ * So we must continue until @start is increased past the
+ * current ordered extent.
+ */
if (ret)
- goto out_drop_extent_cache;
+ btrfs_drop_extent_cache(BTRFS_I(inode), start,
+ start + ram_size - 1, 0);
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- if (disk_num_bytes < cur_alloc_size)
- break;
-
/* we're not doing compressed IO, don't unlock the first
* page (which the caller expects to stay locked), don't
* clear any dirty bits and don't set any writeback bits
@@ -1014,18 +1047,30 @@ static noinline int cow_file_range(struct inode *inode,
* Do set the Private2 bit so we know this page was properly
* setup for writepage
*/
- op = unlock ? PAGE_UNLOCK : 0;
- op |= PAGE_SET_PRIVATE2;
+ page_ops = unlock ? PAGE_UNLOCK : 0;
+ page_ops |= PAGE_SET_PRIVATE2;
extent_clear_unlock_delalloc(inode, start,
start + ram_size - 1,
delalloc_end, locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC,
- op);
- disk_num_bytes -= cur_alloc_size;
+ page_ops);
+ if (disk_num_bytes < cur_alloc_size)
+ disk_num_bytes = 0;
+ else
+ disk_num_bytes -= cur_alloc_size;
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
+ extent_reserved = false;
+
+ /*
+ * On btrfs_reloc_clone_csums() error: since start has been
+ * increased, extent_clear_unlock_delalloc() at the out_unlock
+ * label won't free the metadata of the current ordered extent,
+ * so we're OK to exit.
+ */
+ if (ret)
+ goto out_unlock;
}
out:
return ret;
@@ -1036,12 +1081,35 @@ out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
+ page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
+ PAGE_END_WRITEBACK;
+ /*
+ * If we reserved an extent for our delalloc range (or a subrange) and
+ * failed to create the respective ordered extent, then it means that
+ * when we reserved the extent we decremented the extent's size from
+ * the data space_info's bytes_may_use counter and incremented the
+ * space_info's bytes_reserved counter by the same amount. We must make
+ * sure extent_clear_unlock_delalloc() does not try to decrement again
+ * the data space_info's bytes_may_use counter, therefore we do not pass
+ * it the flag EXTENT_CLEAR_DATA_RESV.
+ */
+ if (extent_reserved) {
+ extent_clear_unlock_delalloc(inode, start,
+ start + cur_alloc_size,
+ start + cur_alloc_size,
+ locked_page,
+ clear_bits,
+ page_ops);
+ start += cur_alloc_size;
+ if (start >= end)
+ goto out;
+ }
extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
locked_page,
- EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DELALLOC | EXTENT_DEFRAG,
- PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+ clear_bits | EXTENT_CLEAR_DATA_RESV,
+ page_ops);
goto out;
}
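The split cleanup above hinges on one invariant: reserving an extent moves bytes from the space_info's bytes_may_use counter into bytes_reserved, so only ranges that never reached btrfs_reserve_extent() may still decrement bytes_may_use on error. A minimal userspace model of that accounting; the struct and helper names here are hypothetical stand-ins, not btrfs APIs.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct space_info {
        uint64_t bytes_may_use;  /* delalloc reservations */
        uint64_t bytes_reserved; /* extents handed out by the allocator */
    };

    /* Reserving an extent moves bytes from may_use to reserved. */
    static void reserve_extent(struct space_info *s, uint64_t len)
    {
        assert(s->bytes_may_use >= len);
        s->bytes_may_use -= len;
        s->bytes_reserved += len;
    }

    /* Error cleanup: only ranges still counted in may_use may drop it. */
    static void clear_delalloc(struct space_info *s, uint64_t len,
                               bool clear_data_resv)
    {
        if (clear_data_resv) {
            assert(s->bytes_may_use >= len);
            s->bytes_may_use -= len;
        }
    }

    int main(void)
    {
        struct space_info s = { .bytes_may_use = 8192 };

        reserve_extent(&s, 4096);        /* first subrange got an extent */
        clear_delalloc(&s, 4096, false); /* so no second decrement here */
        clear_delalloc(&s, 4096, true);  /* rest never reserved an extent */
        assert(s.bytes_may_use == 0 && s.bytes_reserved == 4096);
        return 0;
    }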
@@ -1414,15 +1482,14 @@ out_check:
BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ BTRFS_DATA_RELOC_TREE_OBJECTID)
+ /*
+ * The error is handled later, as we must prevent
+ * extent_clear_unlock_delalloc() in the error handler
+ * from freeing the metadata of the created ordered extent.
+ */
ret = btrfs_reloc_clone_csums(inode, cur_offset,
num_bytes);
- if (ret) {
- if (!nolock && nocow)
- btrfs_end_write_no_snapshoting(root);
- goto error;
- }
- }
extent_clear_unlock_delalloc(inode, cur_offset,
cur_offset + num_bytes - 1, end,
@@ -1434,6 +1501,14 @@ out_check:
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
cur_offset = extent_end;
+
+ /*
+ * On btrfs_reloc_clone_csums() error, we're now OK to call the
+ * error handler, as the metadata for the created ordered extent
+ * will only be freed by btrfs_finish_ordered_io().
+ */
+ if (ret)
+ goto error;
if (cur_offset > end)
break;
}
@@ -1487,10 +1562,11 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
/*
* extent_io.c call back to do delayed allocation processing
*/
-static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+static int run_delalloc_range(void *private_data, struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written)
{
+ struct inode *inode = private_data;
int ret;
int force_cow = need_force_cow(inode, start, end);
@@ -1509,12 +1585,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
ret = cow_file_range_async(inode, locked_page, start, end,
page_started, nr_written);
}
+ if (ret)
+ btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
return ret;
}
-static void btrfs_split_extent_hook(struct inode *inode,
+static void btrfs_split_extent_hook(void *private_data,
struct extent_state *orig, u64 split)
{
+ struct inode *inode = private_data;
u64 size;
/* not delalloc, ignore it */
@@ -1549,10 +1628,11 @@ static void btrfs_split_extent_hook(struct inode *inode,
* extents, such as when we are doing sequential writes, so we can properly
* account for the metadata space we'll need.
*/
-static void btrfs_merge_extent_hook(struct inode *inode,
+static void btrfs_merge_extent_hook(void *private_data,
struct extent_state *new,
struct extent_state *other)
{
+ struct inode *inode = private_data;
u64 new_size, old_size;
u32 num_extents;
@@ -1652,9 +1732,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
* bytes in this file, and to maintain the list of inodes that
* have pending delalloc work to be done.
*/
-static void btrfs_set_bit_hook(struct inode *inode,
+static void btrfs_set_bit_hook(void *private_data,
struct extent_state *state, unsigned *bits)
{
+ struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1682,8 +1763,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
if (btrfs_is_testing(fs_info))
return;
- __percpu_counter_add(&fs_info->delalloc_bytes, len,
- fs_info->delalloc_batch);
+ percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
+ fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
if (*bits & EXTENT_DEFRAG)
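percpu_counter_add_batch() is the renamed __percpu_counter_add(); the explicit batch argument bounds how far a CPU-local delta may drift before it is folded into the shared count. A single-threaded sketch of that batching idea (the kernel implementation keeps one unflushed delta per CPU):

    #include <stdint.h>
    #include <stdio.h>

    struct batched_counter {
        int64_t global; /* authoritative sum */
        int64_t local;  /* unflushed delta, per-CPU in the kernel */
    };

    static void counter_add_batch(struct batched_counter *c, int64_t amount,
                                  int64_t batch)
    {
        c->local += amount;
        /* Fold into the global count only once the delta grows large. */
        if (c->local >= batch || c->local <= -batch) {
            c->global += c->local;
            c->local = 0;
        }
    }

    int main(void)
    {
        struct batched_counter c = { 0 };

        for (int i = 0; i < 10; i++)
            counter_add_batch(&c, 100, 512);
        printf("global=%lld local=%lld\n",
               (long long)c.global, (long long)c.local);
        return 0;
    }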
@@ -1693,15 +1774,24 @@ static void btrfs_set_bit_hook(struct inode *inode,
btrfs_add_delalloc_inodes(root, inode);
spin_unlock(&BTRFS_I(inode)->lock);
}
+
+ if (!(state->state & EXTENT_DELALLOC_NEW) &&
+ (*bits & EXTENT_DELALLOC_NEW)) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
+ state->start;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
}
/*
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
-static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
+static void btrfs_clear_bit_hook(void *private_data,
struct extent_state *state,
unsigned *bits)
{
+ struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(len);
@@ -1722,7 +1812,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
if (*bits & EXTENT_FIRST_DELALLOC) {
*bits &= ~EXTENT_FIRST_DELALLOC;
- } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
+ } else if (!(*bits & EXTENT_CLEAR_META_RESV)) {
spin_lock(&inode->lock);
inode->outstanding_extents -= num_extents;
spin_unlock(&inode->lock);
@@ -1733,7 +1823,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
* don't need to call delalloc_release_metadata if there is an
* error.
*/
- if (*bits & EXTENT_DO_ACCOUNTING &&
+ if (*bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len);
@@ -1741,16 +1831,15 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
if (btrfs_is_testing(fs_info))
return;
- if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
- && do_list && !(state->state & EXTENT_NORESERVE)
- && (*bits & (EXTENT_DO_ACCOUNTING |
- EXTENT_CLEAR_DATA_RESV)))
+ if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ do_list && !(state->state & EXTENT_NORESERVE) &&
+ (*bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(
&inode->vfs_inode,
state->start, len);
- __percpu_counter_add(&fs_info->delalloc_bytes, -len,
- fs_info->delalloc_batch);
+ percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
+ fs_info->delalloc_batch);
spin_lock(&inode->lock);
inode->delalloc_bytes -= len;
if (do_list && inode->delalloc_bytes == 0 &&
@@ -1759,6 +1848,14 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
btrfs_del_delalloc_inode(root, inode);
spin_unlock(&inode->lock);
}
+
+ if ((state->state & EXTENT_DELALLOC_NEW) &&
+ (*bits & EXTENT_DELALLOC_NEW)) {
+ spin_lock(&inode->lock);
+ ASSERT(inode->new_delalloc_bytes >= len);
+ inode->new_delalloc_bytes -= len;
+ spin_unlock(&inode->lock);
+ }
}
/*
@@ -1802,11 +1899,12 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
* At IO completion time the csums attached to the ordered extent record
* are inserted into the btree
*/
-static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
+static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
- int ret = 0;
+ struct inode *inode = private_data;
+ blk_status_t ret = 0;
ret = btrfs_csum_one_bio(inode, bio, 0, 0);
BUG_ON(ret); /* -ENOMEM */
@@ -1821,16 +1919,17 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
* At IO completion time the csums attached to the ordered extent record
* are inserted into the btree
*/
-static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
+static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
+ struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int ret;
+ blk_status_t ret;
ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
if (ret) {
- bio->bi_error = ret;
+ bio->bi_status = ret;
bio_endio(bio);
}
return ret;
@@ -1840,14 +1939,15 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
* extent_io.c submission hook. This does the right thing for csum calculation
* on write, or reading the csums from the tree before a read
*/
-static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
+ struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
- int ret = 0;
+ blk_status_t ret = 0;
int skip_sum;
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
@@ -1877,8 +1977,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
goto mapit;
/* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num,
- bio_flags, bio_offset,
+ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
+ bio_offset, inode,
__btrfs_submit_bio_start,
__btrfs_submit_bio_done);
goto out;
@@ -1892,8 +1992,8 @@ mapit:
ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
out:
- if (ret < 0) {
- bio->bi_error = ret;
+ if (ret) {
+ bio->bi_status = ret;
bio_endio(bio);
}
return ret;
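The bi_error to bi_status switch replaces raw negative errnos with dedicated blk_status_t codes; errno_to_blk_status() and blk_status_to_errno() translate at the boundaries, as later hunks in this file show. An abbreviated model of that two-way mapping, with illustrative BLK_STS_* values rather than the kernel's actual table in block/blk-core.c:

    #include <errno.h>
    #include <stdio.h>

    typedef unsigned char blk_status_t;
    #define BLK_STS_OK     0
    #define BLK_STS_NOSPC  1
    #define BLK_STS_MEDIUM 2
    #define BLK_STS_IOERR  3

    static blk_status_t errno_to_status(int err)
    {
        switch (err) {
        case 0:        return BLK_STS_OK;
        case -ENOSPC:  return BLK_STS_NOSPC;
        case -ENODATA: return BLK_STS_MEDIUM;
        default:       return BLK_STS_IOERR; /* catch-all, as in the kernel */
        }
    }

    static int status_to_errno(blk_status_t s)
    {
        switch (s) {
        case BLK_STS_OK:     return 0;
        case BLK_STS_NOSPC:  return -ENOSPC;
        case BLK_STS_MEDIUM: return -ENODATA;
        default:             return -EIO;
        }
    }

    int main(void)
    {
        /* round-trips to -ENOSPC */
        printf("%d\n", status_to_errno(errno_to_status(-ENOSPC)));
        return 0;
    }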
@@ -1936,6 +2036,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct btrfs_writepage_fixup *fixup;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
struct page *page;
struct inode *inode;
u64 page_start;
@@ -1973,7 +2074,7 @@ again:
goto again;
}
- ret = btrfs_delalloc_reserve_space(inode, page_start,
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
PAGE_SIZE);
if (ret) {
mapping_set_error(page->mapping, ret);
@@ -1993,6 +2094,7 @@ out_page:
unlock_page(page);
put_page(page);
kfree(fixup);
+ extent_changeset_free(data_reserved);
}
/*
@@ -2044,6 +2146,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
+ u64 qg_released;
int extent_inserted = 0;
int ret;
@@ -2099,13 +2202,17 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes, &ins);
+
/*
* Release the reserved range from inode dirty range map, as it is
* already moved into delayed_ref_head
*/
- btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+ ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+ if (ret < 0)
+ goto out;
+ qg_released = ret;
+ ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins);
out:
btrfs_free_path(path);
@@ -2791,6 +2898,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
u64 logical_len = ordered_extent->len;
bool nolock;
bool truncated = false;
+ bool range_locked = false;
+ bool clear_new_delalloc_bytes = false;
+
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
+ clear_new_delalloc_bytes = true;
nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
@@ -2820,7 +2934,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
* space for NOCOW range.
* As NOCOW won't cause a new delayed ref, just free the space
*/
- btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+ btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
ordered_extent->len);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (nolock)
@@ -2839,13 +2953,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
+ range_locked = true;
lock_extent_bits(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
&cached_state);
ret = test_range_bit(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
- EXTENT_DEFRAG, 1, cached_state);
+ EXTENT_DEFRAG, 0, cached_state);
if (ret) {
u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
if (0 && last_snapshot >= BTRFS_I(inode)->generation)
@@ -2864,7 +2979,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
- goto out_unlock;
+ goto out;
}
trans->block_rsv = &fs_info->delalloc_block_rsv;
@@ -2896,7 +3011,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- goto out_unlock;
+ goto out;
}
add_pending_csums(trans, inode, &ordered_extent->list);
@@ -2905,14 +3020,26 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
- goto out_unlock;
+ goto out;
}
ret = 0;
-out_unlock:
- unlock_extent_cached(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset +
- ordered_extent->len - 1, &cached_state, GFP_NOFS);
out:
+ if (range_locked || clear_new_delalloc_bytes) {
+ unsigned int clear_bits = 0;
+
+ if (range_locked)
+ clear_bits |= EXTENT_LOCKED;
+ if (clear_new_delalloc_bytes)
+ clear_bits |= EXTENT_DELALLOC_NEW;
+ clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1,
+ clear_bits,
+ (clear_bits & EXTENT_LOCKED) ? 1 : 0,
+ 0, &cached_state, GFP_NOFS);
+ }
+
if (root != fs_info->tree_root)
btrfs_delalloc_release_metadata(BTRFS_I(inode),
ordered_extent->len);
@@ -4401,9 +4528,17 @@ search_again:
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
item_end +=
btrfs_file_extent_num_bytes(leaf, fi);
+
+ trace_btrfs_truncate_show_fi_regular(
+ BTRFS_I(inode), leaf, fi,
+ found_key.offset);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
item_end += btrfs_file_extent_inline_len(leaf,
path->slots[0], fi);
+
+ trace_btrfs_truncate_show_fi_inline(
+ BTRFS_I(inode), leaf, fi, path->slots[0],
+ found_key.offset);
}
item_end--;
}
@@ -4603,13 +4738,6 @@ error:
btrfs_free_path(path);
- if (err == 0) {
- /* only inline file may have last_size != new_size */
- if (new_size >= fs_info->sectorsize ||
- new_size > fs_info->max_inline)
- ASSERT(last_size == new_size);
- }
-
if (be_nice && bytes_deleted > SZ_32M) {
unsigned long updates = trans->delayed_ref_updates;
if (updates) {
@@ -4642,6 +4770,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
char *kaddr;
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
@@ -4656,7 +4785,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
(!len || ((len & (blocksize - 1)) == 0)))
goto out;
- ret = btrfs_delalloc_reserve_space(inode,
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
round_down(from, blocksize), blocksize);
if (ret)
goto out;
@@ -4664,7 +4793,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(inode, data_reserved,
round_down(from, blocksize),
blocksize);
ret = -ENOMEM;
@@ -4736,11 +4865,12 @@ again:
out_unlock:
if (ret)
- btrfs_delalloc_release_space(inode, block_start,
+ btrfs_delalloc_release_space(inode, data_reserved, block_start,
blocksize);
unlock_page(page);
put_page(page);
out:
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -5135,7 +5265,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
if (state->state & EXTENT_DELALLOC)
- btrfs_qgroup_free_data(inode, start, end - start + 1);
+ btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
@@ -5748,7 +5878,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_item *item;
struct btrfs_dir_item *di;
struct btrfs_key key;
struct btrfs_key found_key;
@@ -5799,7 +5928,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
continue;
}
- item = btrfs_item_nr(slot);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid)
@@ -5814,7 +5942,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
ctx->pos = found_key.offset;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
- if (verify_dir_item(fs_info, leaf, di))
+ if (verify_dir_item(fs_info, leaf, slot, di))
goto next;
name_len = btrfs_dir_name_len(leaf, di);
@@ -6735,7 +6863,6 @@ static noinline int uncompress_inline(struct btrfs_path *path,
*
* This also copies inline extents directly into the page.
*/
-
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page,
size_t pg_offset, u64 start, u64 len,
@@ -6835,11 +6962,18 @@ again:
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
+
+ trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
+ extent_start);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
extent_end = ALIGN(extent_start + size,
fs_info->sectorsize);
+
+ trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
+ path->slots[0],
+ extent_start);
}
next:
if (start >= extent_end) {
@@ -7037,19 +7171,17 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
if (IS_ERR(em))
return em;
- if (em) {
- /*
- * if our em maps to
- * - a hole or
- * - a pre-alloc extent,
- * there might actually be delalloc bytes behind it.
- */
- if (em->block_start != EXTENT_MAP_HOLE &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return em;
- else
- hole_em = em;
- }
+ /*
+ * If our em maps to:
+ * - a hole or
+ * - a pre-alloc extent,
+ * there might actually be delalloc bytes behind it.
+ */
+ if (em->block_start != EXTENT_MAP_HOLE &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ return em;
+ else
+ hole_em = em;
/* check to see if we've wrapped (len == -1 or similar) */
end = start + len;
@@ -7356,11 +7488,11 @@ out:
bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
{
struct radix_tree_root *root = &inode->i_mapping->page_tree;
- int found = false;
+ bool found = false;
void **pagep = NULL;
struct page *page = NULL;
- int start_idx;
- int end_idx;
+ unsigned long start_idx;
+ unsigned long end_idx;
start_idx = start >> PAGE_SHIFT;
@@ -7854,9 +7986,12 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
bio_end_io_t *repair_endio, void *repair_arg)
{
struct io_failure_record *failrec;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct bio *bio;
int isector;
int read_mode = 0;
+ int segs;
int ret;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -7868,13 +8003,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
failed_mirror);
if (!ret) {
- free_io_failure(BTRFS_I(inode), failrec);
+ free_io_failure(failure_tree, io_tree, failrec);
return -EIO;
}
- if ((failed_bio->bi_vcnt > 1)
- || (failed_bio->bi_io_vec->bv_len
- > btrfs_inode_sectorsize(inode)))
+ segs = bio_segments(failed_bio);
+ if (segs > 1 ||
+ (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode)))
read_mode |= REQ_FAILFAST_DEV;
isector = start - btrfs_io_bio(failed_bio)->logical;
@@ -7882,7 +8017,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
pgoff, isector, repair_endio, repair_arg);
if (!bio) {
- free_io_failure(BTRFS_I(inode), failrec);
+ free_io_failure(failure_tree, io_tree, failrec);
return -EIO;
}
bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
@@ -7893,7 +8028,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
if (ret) {
- free_io_failure(BTRFS_I(inode), failrec);
+ free_io_failure(failure_tree, io_tree, failrec);
bio_put(bio);
}
@@ -7910,19 +8045,24 @@ struct btrfs_retry_complete {
static void btrfs_retry_endio_nocsum(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
+ struct inode *inode = done->inode;
struct bio_vec *bvec;
+ struct extent_io_tree *io_tree, *failure_tree;
int i;
- if (bio->bi_error)
+ if (bio->bi_status)
goto end;
ASSERT(bio->bi_vcnt == 1);
- ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
+ io_tree = &BTRFS_I(inode)->io_tree;
+ failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
done->uptodate = 1;
bio_for_each_segment_all(bvec, bio, i)
- clean_io_failure(BTRFS_I(done->inode), done->start,
- bvec->bv_page, 0);
+ clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
+ io_tree, done->start, bvec->bv_page,
+ btrfs_ino(BTRFS_I(inode)), 0);
end:
complete(&done->done);
bio_put(bio);
@@ -7932,36 +8072,40 @@ static int __btrfs_correct_data_nocsum(struct inode *inode,
struct btrfs_io_bio *io_bio)
{
struct btrfs_fs_info *fs_info;
- struct bio_vec *bvec;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
struct btrfs_retry_complete done;
u64 start;
unsigned int pgoff;
u32 sectorsize;
int nr_sectors;
- int i;
int ret;
+ int err = 0;
fs_info = BTRFS_I(inode)->root->fs_info;
sectorsize = fs_info->sectorsize;
start = io_bio->logical;
done.inode = inode;
+ io_bio->bio.bi_iter = io_bio->iter;
- bio_for_each_segment_all(bvec, &io_bio->bio, i) {
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
- pgoff = bvec->bv_offset;
+ bio_for_each_segment(bvec, &io_bio->bio, iter) {
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
+ pgoff = bvec.bv_offset;
next_block_or_try_again:
done.uptodate = 0;
done.start = start;
init_completion(&done.done);
- ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
+ ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
pgoff, start, start + sectorsize - 1,
io_bio->mirror_num,
btrfs_retry_endio_nocsum, &done);
- if (ret)
- return ret;
+ if (ret) {
+ err = ret;
+ goto next;
+ }
wait_for_completion(&done.done);
@@ -7970,6 +8114,7 @@ next_block_or_try_again:
goto next_block_or_try_again;
}
+next:
start += sectorsize;
nr_sectors--;
@@ -7980,19 +8125,21 @@ next_block_or_try_again:
}
}
- return 0;
+ return err;
}
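Note the control-flow change in __btrfs_correct_data_nocsum(): a failed dio_read_error() no longer aborts the walk; the error is recorded and the remaining blocks are still retried, with the last recorded error returned at the end. The shape of that pattern in isolation (process_block() is a made-up stand-in):

    #include <stdio.h>

    /* Hypothetical per-block operation; block 2 fails with -EIO (-5). */
    static int process_block(int i)
    {
        return i == 2 ? -5 : 0;
    }

    int main(void)
    {
        int err = 0;

        for (int i = 0; i < 4; i++) {
            int ret = process_block(i);

            if (ret)
                err = ret; /* remember the error, keep going */
        }
        printf("err=%d\n", err); /* -5, reported after the full walk */
        return 0;
    }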
static void btrfs_retry_endio(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct extent_io_tree *io_tree, *failure_tree;
+ struct inode *inode = done->inode;
struct bio_vec *bvec;
int uptodate;
int ret;
int i;
- if (bio->bi_error)
+ if (bio->bi_status)
goto end;
uptodate = 1;
@@ -8000,13 +8147,19 @@ static void btrfs_retry_endio(struct bio *bio)
ASSERT(bio->bi_vcnt == 1);
ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
+ io_tree = &BTRFS_I(inode)->io_tree;
+ failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
bio_for_each_segment_all(bvec, bio, i) {
- ret = __readpage_endio_check(done->inode, io_bio, i,
- bvec->bv_page, bvec->bv_offset,
- done->start, bvec->bv_len);
+ ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+ bvec->bv_offset, done->start,
+ bvec->bv_len);
if (!ret)
- clean_io_failure(BTRFS_I(done->inode), done->start,
- bvec->bv_page, bvec->bv_offset);
+ clean_io_failure(BTRFS_I(inode)->root->fs_info,
+ failure_tree, io_tree, done->start,
+ bvec->bv_page,
+ btrfs_ino(BTRFS_I(inode)),
+ bvec->bv_offset);
else
uptodate = 0;
}
@@ -8017,11 +8170,12 @@ end:
bio_put(bio);
}
-static int __btrfs_subio_endio_read(struct inode *inode,
- struct btrfs_io_bio *io_bio, int err)
+static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
+ struct btrfs_io_bio *io_bio, blk_status_t err)
{
struct btrfs_fs_info *fs_info;
- struct bio_vec *bvec;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
struct btrfs_retry_complete done;
u64 start;
u64 offset = 0;
@@ -8029,7 +8183,7 @@ static int __btrfs_subio_endio_read(struct inode *inode,
int nr_sectors;
unsigned int pgoff;
int csum_pos;
- int i;
+ bool uptodate = (err == 0);
int ret;
fs_info = BTRFS_I(inode)->root->fs_info;
@@ -8038,29 +8192,31 @@ static int __btrfs_subio_endio_read(struct inode *inode,
err = 0;
start = io_bio->logical;
done.inode = inode;
+ io_bio->bio.bi_iter = io_bio->iter;
- bio_for_each_segment_all(bvec, &io_bio->bio, i) {
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+ bio_for_each_segment(bvec, &io_bio->bio, iter) {
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
- pgoff = bvec->bv_offset;
+ pgoff = bvec.bv_offset;
next_block:
- csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
- ret = __readpage_endio_check(inode, io_bio, csum_pos,
- bvec->bv_page, pgoff, start,
- sectorsize);
- if (likely(!ret))
- goto next;
+ if (uptodate) {
+ csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
+ ret = __readpage_endio_check(inode, io_bio, csum_pos,
+ bvec.bv_page, pgoff, start, sectorsize);
+ if (likely(!ret))
+ goto next;
+ }
try_again:
done.uptodate = 0;
done.start = start;
init_completion(&done.done);
- ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
+ ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
pgoff, start, start + sectorsize - 1,
io_bio->mirror_num,
btrfs_retry_endio, &done);
if (ret) {
- err = ret;
+ err = errno_to_blk_status(ret);
goto next;
}
@@ -8087,8 +8243,8 @@ next:
return err;
}
-static int btrfs_subio_endio_read(struct inode *inode,
- struct btrfs_io_bio *io_bio, int err)
+static blk_status_t btrfs_subio_endio_read(struct inode *inode,
+ struct btrfs_io_bio *io_bio, blk_status_t err)
{
bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
@@ -8108,10 +8264,13 @@ static void btrfs_endio_direct_read(struct bio *bio)
struct inode *inode = dip->inode;
struct bio *dio_bio;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- int err = bio->bi_error;
+ blk_status_t err = bio->bi_status;
- if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+ if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) {
err = btrfs_subio_endio_read(inode, io_bio, err);
+ if (!err)
+ bio->bi_status = 0;
+ }
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
dip->logical_offset + dip->bytes - 1);
@@ -8119,25 +8278,34 @@ static void btrfs_endio_direct_read(struct bio *bio)
kfree(dip);
- dio_bio->bi_error = bio->bi_error;
- dio_end_io(dio_bio, bio->bi_error);
+ dio_bio->bi_status = bio->bi_status;
+ dio_end_io(dio_bio);
if (io_bio->end_io)
- io_bio->end_io(io_bio, err);
+ io_bio->end_io(io_bio, blk_status_to_errno(err));
bio_put(bio);
}
-static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
- const u64 offset,
- const u64 bytes,
- const int uptodate)
+static void __endio_write_update_ordered(struct inode *inode,
+ const u64 offset, const u64 bytes,
+ const bool uptodate)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered = NULL;
+ struct btrfs_workqueue *wq;
+ btrfs_work_func_t func;
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
int ret;
+ if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ wq = fs_info->endio_freespace_worker;
+ func = btrfs_freespace_write_helper;
+ } else {
+ wq = fs_info->endio_write_workers;
+ func = btrfs_endio_write_helper;
+ }
+
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
@@ -8146,9 +8314,8 @@ again:
if (!ret)
goto out_test;
- btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
- finish_ordered_fn, NULL, NULL);
- btrfs_queue_work(fs_info->endio_write_workers, &ordered->work);
+ btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL);
+ btrfs_queue_work(wq, &ordered->work);
out_test:
/*
* our bio might span multiple ordered extents. If we haven't
@@ -8166,23 +8333,22 @@ static void btrfs_endio_direct_write(struct bio *bio)
struct btrfs_dio_private *dip = bio->bi_private;
struct bio *dio_bio = dip->dio_bio;
- btrfs_endio_direct_write_update_ordered(dip->inode,
- dip->logical_offset,
- dip->bytes,
- !bio->bi_error);
+ __endio_write_update_ordered(dip->inode, dip->logical_offset,
+ dip->bytes, !bio->bi_status);
kfree(dip);
- dio_bio->bi_error = bio->bi_error;
- dio_end_io(dio_bio, bio->bi_error);
+ dio_bio->bi_status = bio->bi_status;
+ dio_end_io(dio_bio);
bio_put(bio);
}
-static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
+static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data,
struct bio *bio, int mirror_num,
unsigned long bio_flags, u64 offset)
{
- int ret;
+ struct inode *inode = private_data;
+ blk_status_t ret;
ret = btrfs_csum_one_bio(inode, bio, offset, 1);
BUG_ON(ret); /* -ENOMEM */
return 0;
@@ -8191,7 +8357,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
static void btrfs_end_dio_bio(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
- int err = bio->bi_error;
+ blk_status_t err = bio->bi_status;
if (err)
btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
@@ -8221,31 +8387,21 @@ static void btrfs_end_dio_bio(struct bio *bio)
if (dip->errors) {
bio_io_error(dip->orig_bio);
} else {
- dip->dio_bio->bi_error = 0;
+ dip->dio_bio->bi_status = 0;
bio_endio(dip->orig_bio);
}
out:
bio_put(bio);
}
-static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
- u64 first_sector, gfp_t gfp_flags)
-{
- struct bio *bio;
- bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
- if (bio)
- bio_associate_current(bio);
- return bio;
-}
-
-static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode,
+static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
struct btrfs_dio_private *dip,
struct bio *bio,
u64 file_offset)
{
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
- int ret;
+ blk_status_t ret;
/*
* We load all the csum data we need when we submit
@@ -8276,7 +8432,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_private *dip = bio->bi_private;
bool write = bio_op(bio) == REQ_OP_WRITE;
- int ret;
+ blk_status_t ret;
if (async_submit)
async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
@@ -8293,8 +8449,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
goto map;
if (write && async_submit) {
- ret = btrfs_wq_submit_bio(fs_info, inode, bio, 0, 0,
- file_offset,
+ ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
+ file_offset, inode,
__btrfs_submit_bio_start_direct_io,
__btrfs_submit_bio_done);
goto err;
@@ -8324,103 +8480,83 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
{
struct inode *inode = dip->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct bio *bio;
struct bio *orig_bio = dip->orig_bio;
- struct bio_vec *bvec;
u64 start_sector = orig_bio->bi_iter.bi_sector;
u64 file_offset = dip->logical_offset;
- u64 submit_len = 0;
u64 map_length;
- u32 blocksize = fs_info->sectorsize;
int async_submit = 0;
- int nr_sectors;
+ u64 submit_len;
+ int clone_offset = 0;
+ int clone_len;
int ret;
- int i, j;
map_length = orig_bio->bi_iter.bi_size;
+ submit_len = map_length;
ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
&map_length, NULL, 0);
if (ret)
return -EIO;
- if (map_length >= orig_bio->bi_iter.bi_size) {
+ if (map_length >= submit_len) {
bio = orig_bio;
dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
goto submit;
}
/* async crcs make it difficult to collect full stripe writes. */
- if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
async_submit = 0;
else
async_submit = 1;
- bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
- if (!bio)
- return -ENOMEM;
-
- bio->bi_opf = orig_bio->bi_opf;
- bio->bi_private = dip;
- bio->bi_end_io = btrfs_end_dio_bio;
- btrfs_io_bio(bio)->logical = file_offset;
+ /* bio split */
+ ASSERT(map_length <= INT_MAX);
atomic_inc(&dip->pending_bios);
+ do {
+ clone_len = min_t(int, submit_len, map_length);
- bio_for_each_segment_all(bvec, orig_bio, j) {
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
- i = 0;
-next_block:
- if (unlikely(map_length < submit_len + blocksize ||
- bio_add_page(bio, bvec->bv_page, blocksize,
- bvec->bv_offset + (i * blocksize)) < blocksize)) {
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the dip might get freed
- * before we're done setting it up
- */
- atomic_inc(&dip->pending_bios);
- ret = __btrfs_submit_dio_bio(bio, inode,
- file_offset, skip_sum,
- async_submit);
- if (ret) {
- bio_put(bio);
- atomic_dec(&dip->pending_bios);
- goto out_err;
- }
-
- start_sector += submit_len >> 9;
- file_offset += submit_len;
+ /*
+ * This will never fail as it's passing GFP_NOFS and
+ * the allocation is backed by btrfs_bioset.
+ */
+ bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
+ clone_len);
+ bio->bi_private = dip;
+ bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_io_bio(bio)->logical = file_offset;
+
+ ASSERT(submit_len >= clone_len);
+ submit_len -= clone_len;
+ if (submit_len == 0)
+ break;
- submit_len = 0;
+ /*
+ * Increase the count before we submit the bio so we know
+ * the end IO handler won't run before the count reflects
+ * this bio. Otherwise, the dip might get freed before we're
+ * done setting it up.
+ */
+ atomic_inc(&dip->pending_bios);
- bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
- start_sector, GFP_NOFS);
- if (!bio)
- goto out_err;
- bio->bi_opf = orig_bio->bi_opf;
- bio->bi_private = dip;
- bio->bi_end_io = btrfs_end_dio_bio;
- btrfs_io_bio(bio)->logical = file_offset;
+ ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
+ async_submit);
+ if (ret) {
+ bio_put(bio);
+ atomic_dec(&dip->pending_bios);
+ goto out_err;
+ }
- map_length = orig_bio->bi_iter.bi_size;
- ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
- start_sector << 9,
- &map_length, NULL, 0);
- if (ret) {
- bio_put(bio);
- goto out_err;
- }
+ clone_offset += clone_len;
+ start_sector += clone_len >> 9;
+ file_offset += clone_len;
- goto next_block;
- } else {
- submit_len += blocksize;
- if (--nr_sectors) {
- i++;
- goto next_block;
- }
- }
- }
+ map_length = submit_len;
+ ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
+ start_sector << 9, &map_length, NULL, 0);
+ if (ret)
+ goto out_err;
+ } while (submit_len > 0);
submit:
ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
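The rewritten split loop replaces page-by-page bio building with partial clones: each iteration clones min(submit_len, map_length) bytes via btrfs_bio_clone_partial(), submits, then re-maps from the new start sector. A userspace walk-through of just that arithmetic; stripe_len() is a made-up stand-in for the map_length that btrfs_map_block() returns.

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t stripe_len(uint64_t sector)
    {
        (void)sector;      /* a real mapper would use this */
        return 64 * 1024;  /* pretend every stripe maps 64K */
    }

    int main(void)
    {
        uint64_t submit_len = 200 * 1024; /* total bio payload */
        uint64_t start_sector = 0;
        int clone_offset = 0;

        while (submit_len > 0) {
            uint64_t map_length = stripe_len(start_sector);
            int clone_len = submit_len < map_length ?
                            (int)submit_len : (int)map_length;

            printf("clone at %d, len %d\n", clone_offset, clone_len);
            submit_len -= clone_len;
            clone_offset += clone_len;
            start_sector += clone_len >> 9; /* bytes to 512B sectors */
        }
        return 0;
    }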
@@ -8447,19 +8583,15 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
loff_t file_offset)
{
struct btrfs_dio_private *dip = NULL;
- struct bio *io_bio = NULL;
- struct btrfs_io_bio *btrfs_bio;
+ struct bio *bio = NULL;
+ struct btrfs_io_bio *io_bio;
int skip_sum;
bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
int ret = 0;
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
- if (!io_bio) {
- ret = -ENOMEM;
- goto free_ordered;
- }
+ bio = btrfs_bio_clone(dio_bio);
dip = kzalloc(sizeof(*dip), GFP_NOFS);
if (!dip) {
@@ -8472,17 +8604,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
dip->logical_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
- io_bio->bi_private = dip;
- dip->orig_bio = io_bio;
+ bio->bi_private = dip;
+ dip->orig_bio = bio;
dip->dio_bio = dio_bio;
atomic_set(&dip->pending_bios, 0);
- btrfs_bio = btrfs_io_bio(io_bio);
- btrfs_bio->logical = file_offset;
+ io_bio = btrfs_io_bio(bio);
+ io_bio->logical = file_offset;
if (write) {
- io_bio->bi_end_io = btrfs_endio_direct_write;
+ bio->bi_end_io = btrfs_endio_direct_write;
} else {
- io_bio->bi_end_io = btrfs_endio_direct_read;
+ bio->bi_end_io = btrfs_endio_direct_read;
dip->subio_endio = btrfs_subio_endio_read;
}
@@ -8505,8 +8637,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
if (!ret)
return;
- if (btrfs_bio->end_io)
- btrfs_bio->end_io(btrfs_bio, ret);
+ if (io_bio->end_io)
+ io_bio->end_io(io_bio, ret);
free_ordered:
/*
@@ -8518,35 +8650,34 @@ free_ordered:
* same as btrfs_endio_direct_[write|read] because we can't call these
* callbacks - they require an allocated dip and a clone of dio_bio.
*/
- if (io_bio && dip) {
- io_bio->bi_error = -EIO;
- bio_endio(io_bio);
+ if (bio && dip) {
+ bio_io_error(bio);
/*
- * The end io callbacks free our dip, do the final put on io_bio
+ * The end io callbacks free our dip, do the final put on bio
* and all the cleanup and final put for dio_bio (through
* dio_end_io()).
*/
dip = NULL;
- io_bio = NULL;
+ bio = NULL;
} else {
if (write)
- btrfs_endio_direct_write_update_ordered(inode,
+ __endio_write_update_ordered(inode,
file_offset,
dio_bio->bi_iter.bi_size,
- 0);
+ false);
else
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
file_offset + dio_bio->bi_iter.bi_size - 1);
- dio_bio->bi_error = -EIO;
+ dio_bio->bi_status = BLK_STS_IOERR;
/*
* Releases and cleans up our dio_bio, no need to bio_put()
* nor bio_endio()/bio_io_error() against dio_bio.
*/
- dio_end_io(dio_bio, ret);
+ dio_end_io(dio_bio);
}
- if (io_bio)
- bio_put(io_bio);
+ if (bio)
+ bio_put(bio);
kfree(dip);
}
@@ -8590,6 +8721,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
struct inode *inode = file->f_mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_data dio_data = { 0 };
+ struct extent_changeset *data_reserved = NULL;
loff_t offset = iocb->ki_pos;
size_t count = 0;
int flags = 0;
@@ -8625,8 +8757,12 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
+ } else if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
}
- ret = btrfs_delalloc_reserve_space(inode, offset, count);
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+ offset, count);
if (ret)
goto out;
dio_data.outstanding_extents = count_max_extents(count);
@@ -8658,8 +8794,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED) {
if (dio_data.reserve)
- btrfs_delalloc_release_space(inode, offset,
- dio_data.reserve);
+ btrfs_delalloc_release_space(inode, data_reserved,
+ offset, dio_data.reserve);
/*
* On error we might have left some ordered extents
* without submitting corresponding bios for them, so
@@ -8668,14 +8804,14 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
*/
if (dio_data.unsubmitted_oe_range_start <
dio_data.unsubmitted_oe_range_end)
- btrfs_endio_direct_write_update_ordered(inode,
+ __endio_write_update_ordered(inode,
dio_data.unsubmitted_oe_range_start,
dio_data.unsubmitted_oe_range_end -
dio_data.unsubmitted_oe_range_start,
- 0);
+ false);
} else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode, offset,
- count - (size_t)ret);
+ btrfs_delalloc_release_space(inode, data_reserved,
+ offset, count - (size_t)ret);
}
out:
if (wakeup)
@@ -8683,6 +8819,7 @@ out:
if (relock)
inode_lock(inode);
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -8819,6 +8956,7 @@ again:
if (!inode_evicting)
clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state,
GFP_NOFS);
@@ -8872,12 +9010,12 @@ again:
* free the entire extent.
*/
if (PageDirty(page))
- btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
&cached_state, GFP_NOFS);
__btrfs_releasepage(page, GFP_NOFS);
@@ -8914,6 +9052,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
char *kaddr;
unsigned long zero_start;
loff_t size;
@@ -8939,7 +9078,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
- ret = btrfs_delalloc_reserve_space(inode, page_start,
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
reserved_space);
if (!ret) {
ret = file_update_time(vmf->vma->vm_file);
@@ -8993,8 +9132,8 @@ again:
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(inode, page_start,
- PAGE_SIZE - reserved_space);
+ btrfs_delalloc_release_space(inode, data_reserved,
+ page_start, PAGE_SIZE - reserved_space);
}
}
@@ -9045,13 +9184,16 @@ again:
out_unlock:
if (!ret) {
sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
}
unlock_page(page);
out:
- btrfs_delalloc_release_space(inode, page_start, reserved_space);
+ btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ reserved_space);
out_noreserve:
sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -9248,6 +9390,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
+ ei->new_delalloc_bytes = 0;
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
@@ -9272,8 +9415,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
- extent_io_tree_init(&ei->io_tree, &inode->i_data);
- extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
+ extent_io_tree_init(&ei->io_tree, inode);
+ extent_io_tree_init(&ei->io_failure_tree, inode);
ei->io_tree.track_uptodate = 1;
ei->io_failure_tree.track_uptodate = 1;
atomic_set(&ei->sync_writers, 0);
@@ -9313,6 +9456,7 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(BTRFS_I(inode)->outstanding_extents);
WARN_ON(BTRFS_I(inode)->reserved_extents);
WARN_ON(BTRFS_I(inode)->delalloc_bytes);
+ WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
WARN_ON(BTRFS_I(inode)->csum_bytes);
WARN_ON(BTRFS_I(inode)->defrag_bytes);
@@ -9381,7 +9525,6 @@ void btrfs_destroy_cachep(void)
rcu_barrier();
kmem_cache_destroy(btrfs_inode_cachep);
kmem_cache_destroy(btrfs_trans_handle_cachep);
- kmem_cache_destroy(btrfs_transaction_cachep);
kmem_cache_destroy(btrfs_path_cachep);
kmem_cache_destroy(btrfs_free_space_cachep);
}
@@ -9401,12 +9544,6 @@ int btrfs_init_cachep(void)
if (!btrfs_trans_handle_cachep)
goto fail;
- btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
- sizeof(struct btrfs_transaction), 0,
- SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
- if (!btrfs_transaction_cachep)
- goto fail;
-
btrfs_path_cachep = kmem_cache_create("btrfs_path",
sizeof(struct btrfs_path), 0,
SLAB_MEM_SPREAD, NULL);
@@ -9431,12 +9568,30 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
u64 delalloc_bytes;
struct inode *inode = d_inode(path->dentry);
u32 blocksize = inode->i_sb->s_blocksize;
+ u32 bi_flags = BTRFS_I(inode)->flags;
+
+ stat->result_mask |= STATX_BTIME;
+ stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
+ stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
+ if (bi_flags & BTRFS_INODE_APPEND)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (bi_flags & BTRFS_INODE_COMPRESS)
+ stat->attributes |= STATX_ATTR_COMPRESSED;
+ if (bi_flags & BTRFS_INODE_IMMUTABLE)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (bi_flags & BTRFS_INODE_NODUMP)
+ stat->attributes |= STATX_ATTR_NODUMP;
+
+ stat->attributes_mask |= (STATX_ATTR_APPEND |
+ STATX_ATTR_COMPRESSED |
+ STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP);
generic_fillattr(inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
- delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+ delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
spin_unlock(&BTRFS_I(inode)->lock);
stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
ALIGN(delalloc_bytes, blocksize)) >> 9;
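Two user-visible changes land here: btrfs_getattr() now reports the inode's birth time and flag bits through statx, and st_blocks counts only new_delalloc_bytes (delalloc bytes that do not yet have extents allocated) instead of all delalloc, so buffered overwrites no longer inflate the block count. A minimal statx(2) caller that reads the new fields, assuming a glibc (2.28+) that wraps the syscall:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(int argc, char **argv)
    {
        struct statx stx;

        if (argc < 2 || statx(AT_FDCWD, argv[1], AT_STATX_SYNC_AS_STAT,
                              STATX_BTIME | STATX_BLOCKS, &stx)) {
            perror("statx");
            return 1;
        }
        if (stx.stx_mask & STATX_BTIME)
            printf("btime: %lld\n", (long long)stx.stx_btime.tv_sec);
        if (stx.stx_attributes_mask & STATX_ATTR_COMPRESSED)
            printf("compressed: %s\n",
                   (stx.stx_attributes & STATX_ATTR_COMPRESSED) ?
                   "yes" : "no");
        printf("blocks: %llu\n", (unsigned long long)stx.stx_blocks);
        return 0;
    }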
@@ -10405,7 +10560,7 @@ next:
btrfs_end_transaction(trans);
}
if (cur_offset < end)
- btrfs_free_reserved_data_space(inode, cur_offset,
+ btrfs_free_reserved_data_space(inode, NULL, cur_offset,
end - cur_offset + 1);
return ret;
}
@@ -10526,6 +10681,42 @@ static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
return -EAGAIN;
}
+static struct btrfs_fs_info *iotree_fs_info(void *private_data)
+{
+ struct inode *inode = private_data;
+ return btrfs_sb(inode->i_sb);
+}
+
+static void btrfs_check_extent_io_range(void *private_data, const char *caller,
+ u64 start, u64 end)
+{
+ struct inode *inode = private_data;
+ u64 isize;
+
+ isize = i_size_read(inode);
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
+ caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+ }
+}
+
+void btrfs_set_range_writeback(void *private_data, u64 start, u64 end)
+{
+ struct inode *inode = private_data;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(inode->i_mapping, index);
+ ASSERT(page); /* Pages should be in the extent_io_tree */
+ set_page_writeback(page);
+ put_page(page);
+ index++;
+ }
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
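The helpers above illustrate the theme running through this series: the extent_io_tree now stores an opaque private_data pointer (set to the inode in btrfs_alloc_inode()) and every hook casts it back, decoupling extent_io.c from struct inode. The same pattern in a self-contained form, with hypothetical names:

    #include <stdio.h>

    struct io_tree {
        void *private_data;               /* the owning "inode" */
        void (*hook)(void *private_data); /* layer-crossing callback */
    };

    struct my_inode {
        long ino;
    };

    static void my_hook(void *private_data)
    {
        struct my_inode *inode = private_data; /* cast back, as the hooks do */

        printf("hook for ino %ld\n", inode->ino);
    }

    int main(void)
    {
        struct my_inode inode = { .ino = 257 };
        struct io_tree tree = { .private_data = &inode, .hook = my_hook };

        tree.hook(tree.private_data);
        return 0;
    }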
@@ -10569,6 +10760,8 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
.readpage_end_io_hook = btrfs_readpage_end_io_hook,
.merge_bio_hook = btrfs_merge_bio_hook,
.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
+ .tree_fs_info = iotree_fs_info,
+ .set_range_writeback = btrfs_set_range_writeback,
/* optional callbacks */
.fill_delalloc = run_delalloc_range,
@@ -10578,6 +10771,7 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
.clear_bit_hook = btrfs_clear_bit_hook,
.merge_extent_hook = btrfs_merge_extent_hook,
.split_extent_hook = btrfs_split_extent_hook,
+ .check_extent_io_range = btrfs_check_extent_io_range,
};
/*
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dabfc7ac48a6..fa1b78cf25f6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -37,7 +37,7 @@
#include <linux/bit_spinlock.h>
#include <linux/security.h>
#include <linux/xattr.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
@@ -689,7 +689,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto dec_and_free;
- btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
@@ -1127,6 +1127,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_io_tree *tree;
+ struct extent_changeset *data_reserved = NULL;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
file_end = (isize - 1) >> PAGE_SHIFT;
@@ -1135,7 +1136,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
- ret = btrfs_delalloc_reserve_space(inode,
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT);
if (ret)
@@ -1226,7 +1227,7 @@ again:
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
(page_cnt - i_done) << PAGE_SHIFT);
}
@@ -1247,15 +1248,17 @@ again:
unlock_page(pages[i]);
put_page(pages[i]);
}
+ extent_changeset_free(data_reserved);
return i_done;
out:
for (i = 0; i < i_done; i++) {
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT);
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -1504,7 +1507,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret)
return ret;
- if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
mnt_drop_write_file(file);
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
@@ -1619,7 +1622,7 @@ out_free:
kfree(vol_args);
out:
mutex_unlock(&fs_info->volume_mutex);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
mnt_drop_write_file(file);
return ret;
}
@@ -2661,7 +2664,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1))
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
mutex_lock(&fs_info->volume_mutex);
@@ -2680,7 +2683,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
mutex_unlock(&fs_info->volume_mutex);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return ret;
}
@@ -2708,7 +2711,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
return -EOPNOTSUPP;
- if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
@@ -2721,7 +2724,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
mutex_unlock(&fs_info->volume_mutex);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
if (!ret) {
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
@@ -2752,7 +2755,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out_drop_write;
}
@@ -2772,7 +2775,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
out:
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out_drop_write:
mnt_drop_write_file(file);
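All of the mutually_exclusive_operation_running conversions in this file follow one template: atomic_xchg(&running, 1) becomes test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags), and atomic_set(..., 0) becomes clear_bit(). The same guard in miniature, sketched with C11 atomics rather than kernel bitops:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_flag excl_op = ATOMIC_FLAG_INIT;

    static int try_exclusive_op(const char *name)
    {
        /* test_and_set_bit() analogue: returns the old value. */
        if (atomic_flag_test_and_set(&excl_op)) {
            printf("%s: EXCL_RUN_IN_PROGRESS\n", name);
            return -1;
        }
        printf("%s: running\n", name);
        atomic_flag_clear(&excl_op); /* clear_bit() analogue */
        return 0;
    }

    int main(void)
    {
        try_exclusive_op("resize");
        try_exclusive_op("add_dev");
        return 0;
    }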
@@ -3539,12 +3542,9 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 last_dest_end = destoff;
ret = -ENOMEM;
- buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN);
- if (!buf) {
- buf = vmalloc(fs_info->nodesize);
- if (!buf)
- return ret;
- }
+ buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
+ if (!buf)
+ return ret;
path = btrfs_alloc_path();
if (!path) {
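kvmalloc()/kvfree() absorb the deleted try-kmalloc-then-vmalloc dance (the lzo workspace hunks below get the same treatment). A simplified userspace model of the fallback the API performs, with malloc() standing in for both allocators:

    #include <stdlib.h>

    static void *kvmalloc_sketch(size_t size)
    {
        /* "kmalloc" path: only attempt small, physically-contiguous sizes */
        void *p = size <= 4096 ? malloc(size) : NULL;

        if (!p)
            p = malloc(size); /* "vmalloc" fallback for larger sizes */
        return p;
    }

    static void kvfree_sketch(void *p)
    {
        free(p); /* kvfree() likewise picks the matching free routine */
    }

    int main(void)
    {
        void *buf = kvmalloc_sketch(16384);

        kvfree_sketch(buf);
        return 0;
    }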
@@ -4442,13 +4442,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
ret = -EROFS;
goto out;
}
- if (atomic_xchg(
- &fs_info->mutually_exclusive_operation_running, 1)) {
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
ret = btrfs_dev_replace_by_ioctl(fs_info, p);
- atomic_set(
- &fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
@@ -4593,7 +4591,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
out:
btrfs_free_path(path);
- vfree(inodes);
+ kvfree(inodes);
kfree(loi);
return ret;
@@ -4643,7 +4641,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
return ret;
again:
- if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
@@ -4689,7 +4687,7 @@ again:
}
locked:
- BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
+ BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
@@ -4745,11 +4743,10 @@ locked:
do_balance:
/*
- * Ownership of bctl and mutually_exclusive_operation_running
+ * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP
* goes to btrfs_balance. bctl is freed in __cancel_balance,
* or, if restriper was paused all the way until unmount, in
- * free_fs_info. mutually_exclusive_operation_running is
- * cleared in __cancel_balance.
+ * free_fs_info. The flag is cleared in __cancel_balance.
*/
need_unlock = false;
@@ -4769,7 +4766,7 @@ out_unlock:
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
if (need_unlock)
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out:
mnt_drop_write_file(file);
return ret;
@@ -4903,7 +4900,6 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
goto out;
}
- /* FIXME: check if the IDs really exist */
if (sa->assign) {
ret = btrfs_add_qgroup_relation(trans, fs_info,
sa->src, sa->dst);
@@ -4962,7 +4958,6 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
- /* FIXME: check if the IDs really exist */
if (sa->create) {
ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid);
} else {
@@ -5016,7 +5011,6 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
qgroupid = root->root_key.objectid;
}
- /* FIXME: check if the IDs really exist */
ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim);
err = btrfs_end_transaction(trans);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index f48c8c14dc14..d433e75d489a 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -18,13 +18,14 @@
#include <linux/kernel.h>
#include <linux/slab.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/lzo.h>
+#include <linux/refcount.h>
#include "compression.h"
#define LZO_LEN 4
@@ -40,9 +41,9 @@ static void lzo_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- vfree(workspace->buf);
- vfree(workspace->cbuf);
- vfree(workspace->mem);
+ kvfree(workspace->buf);
+ kvfree(workspace->cbuf);
+ kvfree(workspace->mem);
kfree(workspace);
}
@@ -50,13 +51,13 @@ static struct list_head *lzo_alloc_workspace(void)
{
struct workspace *workspace;
- workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+ workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
- workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE));
- workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_SIZE));
+ workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+ workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
+ workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
@@ -141,7 +142,7 @@ static int lzo_compress_pages(struct list_head *ws,
ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
&out_len, workspace->mem);
if (ret != LZO_E_OK) {
- pr_debug("BTRFS: deflate in loop returned %d\n",
+ pr_debug("BTRFS: lzo in loop returned %d\n",
ret);
ret = -EIO;
goto out;
@@ -229,8 +230,10 @@ static int lzo_compress_pages(struct list_head *ws,
in_len = min(bytes_left, PAGE_SIZE);
}
- if (tot_out > tot_in)
+ if (tot_out >= tot_in) {
+ ret = -E2BIG;
goto out;
+ }
/* store the size of all chunks of compressed data */
cpage_out = kmap(pages[0]);
@@ -254,16 +257,13 @@ out:
return ret;
}
-static int lzo_decompress_bio(struct list_head *ws,
- struct page **pages_in,
- u64 disk_start,
- struct bio *orig_bio,
- size_t srclen)
+static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
char *data_in;
unsigned long page_in_index = 0;
+ size_t srclen = cb->compressed_len;
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long buf_offset = 0;
@@ -278,6 +278,9 @@ static int lzo_decompress_bio(struct list_head *ws,
unsigned long tot_len;
char *buf;
bool may_late_unmap, need_unmap;
+ struct page **pages_in = cb->compressed_pages;
+ u64 disk_start = cb->start;
+ struct bio *orig_bio = cb->orig_bio;
data_in = kmap(pages_in[0]);
tot_len = read_compress_length(data_in);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 9a46878ba60f..a3aca495e33e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -212,7 +212,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
/* one ref for the tree */
- atomic_set(&entry->refs, 1);
+ refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
INIT_LIST_HEAD(&entry->root_extent_list);
@@ -358,7 +358,7 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
out:
if (!ret && cached && entry) {
*cached = entry;
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
}
spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
@@ -425,7 +425,7 @@ have_entry:
out:
if (!ret && cached && entry) {
*cached = entry;
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
}
spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
@@ -456,7 +456,7 @@ void btrfs_get_logged_extents(struct btrfs_inode *inode,
if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
continue;
list_add(&ordered->log_list, logged_list);
- atomic_inc(&ordered->refs);
+ refcount_inc(&ordered->refs);
}
spin_unlock_irq(&tree->lock);
}
@@ -565,7 +565,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
trace_btrfs_ordered_extent_put(entry->inode, entry);
- if (atomic_dec_and_test(&entry->refs)) {
+ if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->log_list));
ASSERT(list_empty(&entry->trans_list));
ASSERT(list_empty(&entry->root_extent_list));
@@ -623,7 +623,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
spin_lock(&fs_info->trans_lock);
trans = fs_info->running_transaction;
if (trans)
- atomic_inc(&trans->use_count);
+ refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
ASSERT(trans);
@@ -663,7 +663,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
+u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -671,7 +671,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
LIST_HEAD(skipped);
LIST_HEAD(works);
struct btrfs_ordered_extent *ordered, *next;
- int count = 0;
+ u64 count = 0;
const u64 range_end = range_start + range_len;
mutex_lock(&root->ordered_extent_mutex);
@@ -690,7 +690,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
list_move_tail(&ordered->root_extent_list,
&root->ordered_extents);
- atomic_inc(&ordered->refs);
+ refcount_inc(&ordered->refs);
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
@@ -701,7 +701,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
cond_resched();
spin_lock(&root->ordered_extent_lock);
- if (nr != -1)
+ if (nr != U64_MAX)
nr--;
count++;
}
@@ -720,13 +720,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
return count;
}
-int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
- const u64 range_start, const u64 range_len)
+u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
+ const u64 range_start, const u64 range_len)
{
struct btrfs_root *root;
struct list_head splice;
- int done;
- int total_done = 0;
+ u64 total_done = 0;
+ u64 done;
INIT_LIST_HEAD(&splice);
@@ -748,9 +748,8 @@ int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
total_done += done;
spin_lock(&fs_info->ordered_root_lock);
- if (nr != -1) {
+ if (nr != U64_MAX) {
nr -= done;
- WARN_ON(nr < 0);
}
}
list_splice_tail(&splice, &fs_info->ordered_roots);
@@ -870,7 +869,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
if (!offset_in_entry(entry, file_offset))
entry = NULL;
if (entry)
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
out:
spin_unlock_irq(&tree->lock);
return entry;
@@ -911,7 +910,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
}
out:
if (entry)
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
spin_unlock_irq(&tree->lock);
return entry;
}
@@ -948,7 +947,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
out:
spin_unlock_irq(&tree->lock);
return entry;
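The atomic_t to refcount_t conversions in this file (and in raid56.c and scrub.c below) all follow the same shape; a minimal sketch of the lifecycle the new type enforces (the struct and helpers are illustrative, not from this patch):

	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct example_obj {
		refcount_t refs;
	};

	static struct example_obj *example_alloc(void)
	{
		struct example_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

		if (obj)
			refcount_set(&obj->refs, 1);	/* refcount_t must start non-zero */
		return obj;
	}

	static void example_put(struct example_obj *obj)
	{
		/*
		 * Unlike atomic_dec_and_test(), refcount_dec_and_test()
		 * saturates and WARNs on underflow/overflow instead of
		 * wrapping, turning a use-after-free into a detectable leak.
		 */
		if (refcount_dec_and_test(&obj->refs))
			kfree(obj);
	}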
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 195c93b67fe0..56c4c0ee6381 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -113,7 +113,7 @@ struct btrfs_ordered_extent {
int compress_type;
/* reference count */
- atomic_t refs;
+ refcount_t refs;
/* the inode we belong to */
struct inode *inode;
@@ -200,9 +200,9 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum, int len);
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
+u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
-int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
+u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_get_logged_extents(struct btrfs_inode *inode,
struct list_head *logged_list,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index cdafbf92ef0c..fcae61e175f3 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -261,8 +261,11 @@ void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l)
case BTRFS_BLOCK_GROUP_ITEM_KEY:
bi = btrfs_item_ptr(l, i,
struct btrfs_block_group_item);
- pr_info("\t\tblock group used %llu\n",
- btrfs_disk_block_group_used(l, bi));
+ pr_info(
+ "\t\tblock group used %llu chunk_objectid %llu flags %llu\n",
+ btrfs_disk_block_group_used(l, bi),
+ btrfs_disk_block_group_chunk_objectid(l, bi),
+ btrfs_disk_block_group_flags(l, bi));
break;
case BTRFS_CHUNK_ITEM_KEY:
print_chunk(l, btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index d6cb155ef7a1..4b23ae5d0e5c 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -164,6 +164,7 @@ static int iterate_object_props(struct btrfs_root *root,
size_t),
void *ctx)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *name_buf = NULL;
char *value_buf = NULL;
@@ -214,6 +215,12 @@ static int iterate_object_props(struct btrfs_root *root,
name_ptr = (unsigned long)(di + 1);
data_ptr = name_ptr + name_len;
+ if (verify_dir_item(fs_info, leaf,
+ path->slots[0], di)) {
+ ret = -EIO;
+ goto out;
+ }
+
if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
name_ptr,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index afbea61d957e..4ce351efe281 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -47,50 +47,6 @@
* - check all ioctl parameters
*/
-/*
- * one struct for each qgroup, organized in fs_info->qgroup_tree.
- */
-struct btrfs_qgroup {
- u64 qgroupid;
-
- /*
- * state
- */
- u64 rfer; /* referenced */
- u64 rfer_cmpr; /* referenced compressed */
- u64 excl; /* exclusive */
- u64 excl_cmpr; /* exclusive compressed */
-
- /*
- * limits
- */
- u64 lim_flags; /* which limits are set */
- u64 max_rfer;
- u64 max_excl;
- u64 rsv_rfer;
- u64 rsv_excl;
-
- /*
- * reservation tracking
- */
- u64 reserved;
-
- /*
- * lists
- */
- struct list_head groups; /* groups this group is member of */
- struct list_head members; /* groups that are members of this group */
- struct list_head dirty; /* dirty groups */
- struct rb_node node; /* tree of qgroups */
-
- /*
- * temp variables for accounting operations
- * Refer to qgroup_shared_accounting() for details.
- */
- u64 old_refcnt;
- u64 new_refcnt;
-};
-
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
int mod)
{
@@ -1078,6 +1034,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
qgroup->excl += sign * num_bytes;
qgroup->excl_cmpr += sign * num_bytes;
if (sign > 0) {
+ trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes);
if (qgroup->reserved < num_bytes)
report_reserved_underflow(fs_info, qgroup, num_bytes);
else
@@ -1103,6 +1060,8 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
if (sign > 0) {
+ trace_qgroup_update_reserve(fs_info, qgroup,
+ -(s64)num_bytes);
if (qgroup->reserved < num_bytes)
report_reserved_underflow(fs_info, qgroup,
num_bytes);
@@ -1447,38 +1406,6 @@ out:
return ret;
}
-int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct btrfs_qgroup_extent_record *record;
- struct btrfs_delayed_ref_root *delayed_refs;
- struct rb_node *node;
- u64 qgroup_to_skip;
- int ret = 0;
-
- delayed_refs = &trans->transaction->delayed_refs;
- qgroup_to_skip = delayed_refs->qgroup_to_skip;
-
- /*
- * No need to do lock, since this function will only be called in
- * btrfs_commit_transaction().
- */
- node = rb_first(&delayed_refs->dirty_extent_root);
- while (node) {
- record = rb_entry(node, struct btrfs_qgroup_extent_record,
- node);
- if (WARN_ON(!record->old_roots))
- ret = btrfs_find_all_roots(NULL, fs_info,
- record->bytenr, 0, &record->old_roots);
- if (ret < 0)
- break;
- if (qgroup_to_skip)
- ulist_del(record->old_roots, qgroup_to_skip, 0);
- node = rb_next(node);
- }
- return ret;
-}
-
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record)
@@ -1600,6 +1527,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
if (ret)
return ret;
}
+ cond_resched();
return 0;
}
@@ -1959,6 +1887,35 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
return 0;
}
+/*
+ * Check whether @roots could be a list of fs tree roots
+ *
+ * Return 0 if the ulist definitely contains no fs/subvol tree roots
+ * Return 1 if the list may contain fs/subvol tree roots (an empty list
+ * counts as possible as well)
+ */
+static int maybe_fs_roots(struct ulist *roots)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+
+ /* Empty one, still possible for fs roots */
+ if (!roots || roots->nnodes == 0)
+ return 1;
+
+ ULIST_ITER_INIT(&uiter);
+ unode = ulist_next(roots, &uiter);
+ if (!unode)
+ return 1;
+
+ /*
+ * If it contains fs tree roots, then it must belong to fs/subvol
+ * trees.
+ * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
+ */
+ return is_fstree(unode->val);
+}
+
int
btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
@@ -1975,10 +1932,20 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
- if (new_roots)
+ if (new_roots) {
+ if (!maybe_fs_roots(new_roots))
+ goto out_free;
nr_new_roots = new_roots->nnodes;
- if (old_roots)
+ }
+ if (old_roots) {
+ if (!maybe_fs_roots(old_roots))
+ goto out_free;
nr_old_roots = old_roots->nnodes;
+ }
+
+ /* Quick exit, either not fs tree roots, or won't affect any qgroup */
+ if (nr_old_roots == 0 && nr_new_roots == 0)
+ goto out_free;
BUG_ON(!fs_info->quota_root);
@@ -2058,16 +2025,32 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
if (!ret) {
/*
- * Use (u64)-1 as time_seq to do special search, which
+ * Old roots should be searched when inserting qgroup
+ * extent record
+ */
+ if (WARN_ON(!record->old_roots)) {
+ /* Search commit root to find old_roots */
+ ret = btrfs_find_all_roots(NULL, fs_info,
+ record->bytenr, 0,
+ &record->old_roots);
+ if (ret < 0)
+ goto cleanup;
+ }
+
+ /*
+ * Use SEQ_LAST as time_seq to do special search, which
* doesn't lock tree or delayed_refs and search current
* root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, (u64)-1, &new_roots);
+ record->bytenr, SEQ_LAST, &new_roots);
if (ret < 0)
goto cleanup;
- if (qgroup_to_skip)
+ if (qgroup_to_skip) {
ulist_del(new_roots, qgroup_to_skip, 0);
+ ulist_del(record->old_roots, qgroup_to_skip,
+ 0);
+ }
ret = btrfs_qgroup_account_extent(trans, fs_info,
record->bytenr, record->num_bytes,
record->old_roots, new_roots);
@@ -2370,6 +2353,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
int ret = 0;
+ int retried = 0;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -2379,6 +2363,11 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
if (num_bytes == 0)
return 0;
+ if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
+ capable(CAP_SYS_RESOURCE))
+ enforce = false;
+
+retry:
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
if (!quota_root)
@@ -2405,6 +2394,27 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
qg = unode_aux_to_qgroup(unode);
if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+ /*
+ * Commit the tree and retry, since we may have
+ * deletions which would free up space.
+ */
+ if (!retried && qg->reserved > 0) {
+ struct btrfs_trans_handle *trans;
+
+ spin_unlock(&fs_info->qgroup_lock);
+ ret = btrfs_start_delalloc_inodes(root, 0);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ return ret;
+ retried++;
+ goto retry;
+ }
ret = -EDQUOT;
goto out;
}
@@ -2427,6 +2437,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
qg = unode_aux_to_qgroup(unode);
+ trace_qgroup_update_reserve(fs_info, qg, num_bytes);
qg->reserved += num_bytes;
}
@@ -2472,6 +2483,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
qg = unode_aux_to_qgroup(unode);
+ trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes);
if (qg->reserved < num_bytes)
report_reserved_underflow(fs_info, qg, num_bytes);
else
@@ -2490,18 +2502,6 @@ out:
spin_unlock(&fs_info->qgroup_lock);
}
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
-{
- if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
- return;
- btrfs_err(trans->fs_info,
- "qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x",
- trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
- (u32)(trans->delayed_ref_elem.seq >> 32),
- (u32)trans->delayed_ref_elem.seq);
- BUG();
-}
-
/*
* returns < 0 on error, 0 when more leafs are to be scanned.
* returns 1 when done.
@@ -2835,70 +2835,146 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
* Return <0 for error (including -EDQUOT)
*
* NOTE: this function may sleep for memory allocation.
+ * If btrfs_qgroup_reserve_data() is called multiple times with the
+ * same @reserved, the caller must ensure that, when an error happens,
+ * it is OK to free *ALL* reserved space.
*/
-int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+int btrfs_qgroup_reserve_data(struct inode *inode,
+ struct extent_changeset **reserved_ret, u64 start,
+ u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_changeset changeset;
struct ulist_node *unode;
struct ulist_iterator uiter;
+ struct extent_changeset *reserved;
+ u64 orig_reserved;
+ u64 to_reserve;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
!is_fstree(root->objectid) || len == 0)
return 0;
- changeset.bytes_changed = 0;
- ulist_init(&changeset.range_changed);
+ /* @reserved parameter is mandatory for qgroup */
+ if (WARN_ON(!reserved_ret))
+ return -EINVAL;
+ if (!*reserved_ret) {
+ *reserved_ret = extent_changeset_alloc();
+ if (!*reserved_ret)
+ return -ENOMEM;
+ }
+ reserved = *reserved_ret;
+ /* Record already reserved space */
+ orig_reserved = reserved->bytes_changed;
ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
+ start + len - 1, EXTENT_QGROUP_RESERVED, reserved);
+
+ /* Newly reserved space */
+ to_reserve = reserved->bytes_changed - orig_reserved;
trace_btrfs_qgroup_reserve_data(inode, start, len,
- changeset.bytes_changed,
- QGROUP_RESERVE);
+ to_reserve, QGROUP_RESERVE);
if (ret < 0)
goto cleanup;
- ret = qgroup_reserve(root, changeset.bytes_changed, true);
+ ret = qgroup_reserve(root, to_reserve, true);
if (ret < 0)
goto cleanup;
- ulist_release(&changeset.range_changed);
return ret;
cleanup:
- /* cleanup already reserved ranges */
+ /* cleanup *ALL* already reserved ranges */
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(&changeset.range_changed, &uiter)))
+ while ((unode = ulist_next(&reserved->range_changed, &uiter)))
clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
GFP_NOFS);
- ulist_release(&changeset.range_changed);
+ extent_changeset_release(reserved);
return ret;
}
-static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
- int free)
+/* Free ranges specified by @reserved, normally in the error path */
+static int qgroup_free_reserved_data(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct extent_changeset changeset;
+ int freed = 0;
+ int ret;
+
+ extent_changeset_init(&changeset);
+ len = round_up(start + len, root->fs_info->sectorsize);
+ start = round_down(start, root->fs_info->sectorsize);
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
+ u64 range_start = unode->val;
+ /* unode->aux is the inclusive end */
+ u64 range_len = unode->aux - range_start + 1;
+ u64 free_start;
+ u64 free_len;
+
+ extent_changeset_release(&changeset);
+
+ /* Only free range in range [start, start + len) */
+ if (range_start >= start + len ||
+ range_start + range_len <= start)
+ continue;
+ free_start = max(range_start, start);
+ free_len = min(start + len, range_start + range_len) -
+ free_start;
+ /*
+ * TODO: also modify reserved->ranges_reserved to reflect the
+ * modification.
+ *
+ * However, as long as we free qgroup reserved space according to
+ * EXTENT_QGROUP_RESERVED, we won't double free, so there is no
+ * need to rush.
+ */
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
+ free_start, free_start + free_len - 1,
+ EXTENT_QGROUP_RESERVED, &changeset);
+ if (ret < 0)
+ goto out;
+ freed += changeset.bytes_changed;
+ }
+ btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed);
+ ret = freed;
+out:
+ extent_changeset_release(&changeset);
+ return ret;
+}
+
+static int __btrfs_qgroup_release_data(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len,
+ int free)
{
struct extent_changeset changeset;
int trace_op = QGROUP_RELEASE;
int ret;
- changeset.bytes_changed = 0;
- ulist_init(&changeset.range_changed);
+ /* In release case, we shouldn't have @reserved */
+ WARN_ON(!free && reserved);
+ if (free && reserved)
+ return qgroup_free_reserved_data(inode, reserved, start, len);
+ extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
- if (free) {
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->objectid,
- changeset.bytes_changed);
+ if (free)
trace_op = QGROUP_FREE;
- }
trace_btrfs_qgroup_release_data(inode, start, len,
changeset.bytes_changed, trace_op);
+ if (free)
+ btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
+ BTRFS_I(inode)->root->objectid,
+ changeset.bytes_changed);
+ ret = changeset.bytes_changed;
out:
- ulist_release(&changeset.range_changed);
+ extent_changeset_release(&changeset);
return ret;
}
@@ -2907,14 +2983,17 @@ out:
*
* Should be called when a range of pages get invalidated before reaching disk.
* Or for error cleanup case.
+ * If @reserved is given, only the reserved ranges in [@start, @start + @len)
+ * will be freed.
*
* For data written to disk, use btrfs_qgroup_release_data().
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
+int btrfs_qgroup_free_data(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len)
{
- return __btrfs_qgroup_release_data(inode, start, len, 1);
+ return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
}
/*
@@ -2934,7 +3013,7 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
*/
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
{
- return __btrfs_qgroup_release_data(inode, start, len, 0);
+ return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
}
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
@@ -2948,6 +3027,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
+ trace_qgroup_meta_reserve(root, (s64)num_bytes);
ret = qgroup_reserve(root, num_bytes, enforce);
if (ret < 0)
return ret;
@@ -2967,6 +3047,7 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0);
if (reserved == 0)
return;
+ trace_qgroup_meta_reserve(root, -(s64)reserved);
btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved);
}
@@ -2981,6 +3062,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes);
atomic64_sub(num_bytes, &root->qgroup_meta_rsv);
+ trace_qgroup_meta_reserve(root, -(s64)num_bytes);
btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes);
}
@@ -2995,8 +3077,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
struct ulist_iterator iter;
int ret;
- changeset.bytes_changed = 0;
- ulist_init(&changeset.range_changed);
+ extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_QGROUP_RESERVED, &changeset);
@@ -3013,5 +3094,5 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
changeset.bytes_changed);
}
- ulist_release(&changeset.range_changed);
+ extent_changeset_release(&changeset);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 26932a8a1993..d9984e87cddf 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -62,6 +62,50 @@ struct btrfs_qgroup_extent_record {
};
/*
+ * one struct for each qgroup, organized in fs_info->qgroup_tree.
+ */
+struct btrfs_qgroup {
+ u64 qgroupid;
+
+ /*
+ * state
+ */
+ u64 rfer; /* referenced */
+ u64 rfer_cmpr; /* referenced compressed */
+ u64 excl; /* exclusive */
+ u64 excl_cmpr; /* exclusive compressed */
+
+ /*
+ * limits
+ */
+ u64 lim_flags; /* which limits are set */
+ u64 max_rfer;
+ u64 max_excl;
+ u64 rsv_rfer;
+ u64 rsv_excl;
+
+ /*
+ * reservation tracking
+ */
+ u64 reserved;
+
+ /*
+ * lists
+ */
+ struct list_head groups; /* groups this group is member of */
+ struct list_head members; /* groups that are members of this group */
+ struct list_head dirty; /* dirty groups */
+ struct rb_node node; /* tree of qgroups */
+
+ /*
+ * temp variables for accounting operations
+ * Refer to qgroup_shared_accounting() for details.
+ */
+ u64 old_refcnt;
+ u64 new_refcnt;
+};
+
+/*
* For qgroup event trace points only
*/
#define QGROUP_RESERVE (1<<0)
@@ -90,8 +134,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
-int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+
/*
* Inform qgroup to trace one dirty extent, its info is recorded in @record.
* So qgroup can account it at transaction committing time.
@@ -186,17 +229,12 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_inherit *inherit);
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes);
-/*
- * TODO: Add proper trace point for it, as btrfs_qgroup_free() is
- * called by everywhere, can't provide good trace for delayed ref case.
- */
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
- btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
+ btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
}
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
@@ -204,9 +242,11 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
#endif
/* New io_tree based accurate qgroup reserve API */
-int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_reserve_data(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len);
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
-int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len);
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
bool enforce);
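A minimal caller-side sketch of the reworked data reservation API declared above, mirroring the prealloc_file_extent_cluster() changes later in this series (do_the_write() is a hypothetical stand-in for the actual write path; error handling is trimmed):

	struct extent_changeset *data_reserved = NULL;
	int ret;

	/* allocates *data_reserved on first call and records new ranges in it */
	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, len);
	if (ret < 0)
		goto out;

	ret = do_the_write(inode, start, len);
	if (ret < 0)
		/* free only the ranges this changeset actually reserved */
		btrfs_qgroup_free_data(inode, data_reserved, start, len);
	else
		/* data reached disk: release instead of free */
		ret = btrfs_qgroup_release_data(inode, start, len);
out:
	extent_changeset_free(data_reserved);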
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 1571bf26dc07..6f845d219cd6 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -31,7 +31,7 @@
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
@@ -149,7 +149,7 @@ struct btrfs_raid_bio {
int generic_bio_cnt;
- atomic_t refs;
+ refcount_t refs;
atomic_t stripes_pending;
@@ -218,12 +218,9 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
* of a failing mount.
*/
table_size = sizeof(*table) + sizeof(*h) * num_entries;
- table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
- if (!table) {
- table = vzalloc(table_size);
- if (!table)
- return -ENOMEM;
- }
+ table = kvzalloc(table_size, GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
spin_lock_init(&table->cache_lock);
INIT_LIST_HEAD(&table->stripe_cache);
@@ -389,7 +386,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (bio_list_empty(&rbio->bio_list)) {
if (!list_empty(&rbio->hash_list)) {
list_del_init(&rbio->hash_list);
- atomic_dec(&rbio->refs);
+ refcount_dec(&rbio->refs);
BUG_ON(!list_empty(&rbio->plug_list));
}
}
@@ -480,7 +477,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
/* bump our ref if we were not in the list before */
if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
- atomic_inc(&rbio->refs);
+ refcount_inc(&rbio->refs);
if (!list_empty(&rbio->stripe_cache)){
list_move(&rbio->stripe_cache, &table->stripe_cache);
@@ -689,7 +686,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
test_bit(RBIO_CACHE_BIT, &cur->flags) &&
!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
list_del_init(&cur->hash_list);
- atomic_dec(&cur->refs);
+ refcount_dec(&cur->refs);
steal_rbio(cur, rbio);
cache_drop = cur;
@@ -738,7 +735,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
}
}
lockit:
- atomic_inc(&rbio->refs);
+ refcount_inc(&rbio->refs);
list_add(&rbio->hash_list, &h->hash_list);
out:
spin_unlock_irqrestore(&h->lock, flags);
@@ -784,7 +781,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
}
list_del_init(&rbio->hash_list);
- atomic_dec(&rbio->refs);
+ refcount_dec(&rbio->refs);
/*
* we use the plug list to hold all the rbios
@@ -801,7 +798,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
list_del_init(&rbio->plug_list);
list_add(&next->hash_list, &h->hash_list);
- atomic_inc(&next->refs);
+ refcount_inc(&next->refs);
spin_unlock(&rbio->bio_list_lock);
spin_unlock_irqrestore(&h->lock, flags);
@@ -843,8 +840,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
int i;
- WARN_ON(atomic_read(&rbio->refs) < 0);
- if (!atomic_dec_and_test(&rbio->refs))
+ if (!refcount_dec_and_test(&rbio->refs))
return;
WARN_ON(!list_empty(&rbio->stripe_cache));
@@ -872,7 +868,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio)
* this frees the rbio and runs through all the bios in the
* bio_list and calls end_io on them
*/
-static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *next;
@@ -885,7 +881,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
while (cur) {
next = cur->bi_next;
cur->bi_next = NULL;
- cur->bi_error = err;
+ cur->bi_status = err;
bio_endio(cur);
cur = next;
}
@@ -898,7 +894,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
static void raid_write_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- int err = bio->bi_error;
+ blk_status_t err = bio->bi_status;
int max_errors;
if (err)
@@ -915,7 +911,7 @@ static void raid_write_end_io(struct bio *bio)
max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
0 : rbio->bbio->max_errors;
if (atomic_read(&rbio->error) > max_errors)
- err = -EIO;
+ err = BLK_STS_IOERR;
rbio_orig_end_io(rbio, err);
}
@@ -997,7 +993,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
rbio->stripe_npages = stripe_npages;
rbio->faila = -1;
rbio->failb = -1;
- atomic_set(&rbio->refs, 1);
+ refcount_set(&rbio->refs, 1);
atomic_set(&rbio->error, 0);
atomic_set(&rbio->stripes_pending, 0);
@@ -1093,7 +1089,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
* devices or if they are not contiguous
*/
if (last_end == disk_start && stripe->dev->bdev &&
- !last->bi_error &&
+ !last->bi_status &&
last->bi_bdev == stripe->dev->bdev) {
ret = bio_add_page(last, page, PAGE_SIZE, 0);
if (ret == PAGE_SIZE)
@@ -1102,10 +1098,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
}
/* put a new bio on the list */
- bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
- if (!bio)
- return -ENOMEM;
-
+ bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
bio->bi_iter.bi_size = 0;
bio->bi_bdev = stripe->dev->bdev;
bio->bi_iter.bi_sector = disk_start >> 9;
@@ -1449,7 +1442,7 @@ static void raid_rmw_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- if (bio->bi_error)
+ if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
@@ -1992,7 +1985,7 @@ static void raid_recover_end_io(struct bio *bio)
* we only read stripe pages off the disk, set them
* up to date if there were no errors
*/
- if (bio->bi_error)
+ if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
@@ -2118,6 +2111,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_raid_bio *rbio;
int ret;
+ if (generic_io) {
+ ASSERT(bbio->mirror_num == mirror_num);
+ btrfs_io_bio(bio)->mirror_num = mirror_num;
+ }
+
rbio = alloc_rbio(fs_info, bbio, stripe_len);
if (IS_ERR(rbio)) {
if (generic_io)
@@ -2194,6 +2192,8 @@ static void read_rebuild_work(struct btrfs_work *work)
/*
* The following code is used to scrub/replace the parity stripe
*
+ * Caller must have already increased bio_counter when getting @bbio.
+ *
* Note: We need make sure all the pages that add into the scrub/replace
* raid bio are correct and not be changed during the scrub/replace. That
* is those pages just hold metadata or file data with checksum.
@@ -2231,6 +2231,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
ASSERT(rbio->stripe_npages == stripe_nsectors);
bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
+ /*
+ * We have already increased bio_counter when getting bbio, record it
+ * so we can release it in rbio_orig_end_io().
+ */
+ rbio->generic_bio_cnt = 1;
+
return rbio;
}
@@ -2518,7 +2524,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- if (bio->bi_error)
+ if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
@@ -2673,6 +2679,12 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
return NULL;
}
+ /*
+ * When we get bbio, we have already increased bio_counter, record it
+ * so we can release it in rbio_orig_end_io().
+ */
+ rbio->generic_bio_cnt = 1;
+
return rbio;
}
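The int error to blk_status_t conversions above follow the block layer's move to dedicated status codes on struct bio; a minimal endio sketch of the new convention (the handler name is illustrative):

	static void example_end_io(struct bio *bio)
	{
		/* bi_status is a blk_status_t (e.g. BLK_STS_IOERR), not a negative errno */
		if (bio->bi_status)
			pr_debug("I/O failed: %d\n",
				 blk_status_to_errno(bio->bi_status));
		bio_put(bio);
	}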
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index e88bca87f5d2..ab852b8e3e37 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -66,7 +66,6 @@ struct reada_extctl {
struct reada_extent {
u64 logical;
struct btrfs_key top;
- int err;
struct list_head extctl;
int refcnt;
spinlock_t lock;
@@ -209,9 +208,9 @@ cleanup:
return;
}
-int btree_readahead_hook(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int err)
+int btree_readahead_hook(struct extent_buffer *eb, int err)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
int ret = 0;
struct reada_extent *re;
@@ -235,10 +234,10 @@ start_machine:
return ret;
}
-static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
- struct btrfs_device *dev, u64 logical,
+static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
struct btrfs_bio *bbio)
{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
int ret;
struct reada_zone *zone;
struct btrfs_block_group_cache *cache = NULL;
@@ -270,6 +269,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
if (!zone)
return NULL;
+ ret = radix_tree_preload(GFP_KERNEL);
+ if (ret) {
+ kfree(zone);
+ return NULL;
+ }
+
zone->start = start;
zone->end = end;
INIT_LIST_HEAD(&zone->list);
@@ -299,6 +304,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
zone = NULL;
}
spin_unlock(&fs_info->reada_lock);
+ radix_tree_preload_end();
return zone;
}
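radix_tree_preload() above preallocates tree nodes with a sleeping GFP_KERNEL allocation so that the later radix_tree_insert() can run under the reada spinlock without allocating; on success it also disables preemption until radix_tree_preload_end(). A minimal sketch of the pattern (the lock and tree names are illustrative):

	ret = radix_tree_preload(GFP_KERNEL);	/* may sleep; preallocates nodes */
	if (ret)
		return ret;

	spin_lock(&lock);
	ret = radix_tree_insert(&tree, index, item);	/* needs no allocation now */
	spin_unlock(&lock);

	radix_tree_preload_end();		/* re-enables preemption */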
@@ -313,7 +319,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
struct btrfs_bio *bbio = NULL;
struct btrfs_device *dev;
struct btrfs_device *prev_dev;
- u32 blocksize;
u64 length;
int real_stripes;
int nzones = 0;
@@ -334,7 +339,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
if (!re)
return NULL;
- blocksize = fs_info->nodesize;
re->logical = logical;
re->top = *top;
INIT_LIST_HEAD(&re->extctl);
@@ -344,10 +348,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
/*
* map block
*/
- length = blocksize;
+ length = fs_info->nodesize;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&length, &bbio, 0);
- if (ret || !bbio || length < blocksize)
+ if (ret || !bbio || length < fs_info->nodesize)
goto error;
if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
@@ -367,7 +371,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
if (!dev->bdev)
continue;
- zone = reada_find_zone(fs_info, dev, logical, bbio);
+ zone = reada_find_zone(dev, logical, bbio);
if (!zone)
continue;
@@ -386,6 +390,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
goto error;
}
+ ret = radix_tree_preload(GFP_KERNEL);
+ if (ret)
+ goto error;
+
/* insert extent in reada_tree + all per-device trees, all or nothing */
btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
spin_lock(&fs_info->reada_lock);
@@ -395,13 +403,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
re_exist->refcnt++;
spin_unlock(&fs_info->reada_lock);
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ radix_tree_preload_end();
goto error;
}
if (ret) {
spin_unlock(&fs_info->reada_lock);
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ radix_tree_preload_end();
goto error;
}
+ radix_tree_preload_end();
prev_dev = NULL;
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
&fs_info->dev_replace);
@@ -639,9 +650,9 @@ static int reada_pick_zone(struct btrfs_device *dev)
return 1;
}
-static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
- struct btrfs_device *dev)
+static int reada_start_machine_dev(struct btrfs_device *dev)
{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
struct reada_extent *re = NULL;
int mirror_num = 0;
struct extent_buffer *eb = NULL;
@@ -754,8 +765,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (atomic_read(&device->reada_in_flight) <
MAX_IN_FLIGHT)
- enqueued += reada_start_machine_dev(fs_info,
- device);
+ enqueued += reada_start_machine_dev(device);
}
mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d60df51959f7..65661d1aae4e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3093,11 +3093,12 @@ int prealloc_file_extent_cluster(struct inode *inode,
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
u64 cur_offset;
+ struct extent_changeset *data_reserved = NULL;
BUG_ON(cluster->start != cluster->boundary[0]);
inode_lock(inode);
- ret = btrfs_check_data_free_space(inode, prealloc_start,
+ ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
prealloc_end + 1 - prealloc_start);
if (ret)
goto out;
@@ -3113,8 +3114,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
num_bytes = end + 1 - start;
if (cur_offset < start)
- btrfs_free_reserved_data_space(inode, cur_offset,
- start - cur_offset);
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ cur_offset, start - cur_offset);
ret = btrfs_prealloc_file_range(inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
@@ -3125,10 +3126,11 @@ int prealloc_file_extent_cluster(struct inode *inode,
nr++;
}
if (cur_offset < prealloc_end)
- btrfs_free_reserved_data_space(inode, cur_offset,
- prealloc_end + 1 - cur_offset);
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ cur_offset, prealloc_end + 1 - cur_offset);
out:
inode_unlock(inode);
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -4269,8 +4271,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&rc->reloc_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
- extent_io_tree_init(&rc->processed_blocks,
- fs_info->btree_inode->i_mapping);
+ extent_io_tree_init(&rc->processed_blocks, NULL);
return rc;
}
@@ -4372,7 +4373,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
- btrfs_wait_ordered_roots(fs_info, -1,
+ btrfs_wait_ordered_roots(fs_info, U64_MAX,
rc->block_group->key.objectid,
rc->block_group->key.offset);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index a08224eab8b4..460db0cb2d07 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -390,6 +390,13 @@ again:
WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
ptr = (unsigned long)(ref + 1);
+ ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr,
+ name_len);
+ if (!ret) {
+ err = -EIO;
+ goto out;
+ }
+
WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
*sequence = btrfs_root_ref_sequence(leaf, ref);
@@ -501,8 +508,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_root_item *item = &root->root_item;
- struct timespec ct = current_fs_time(root->fs_info->sb);
+ struct timespec ct;
+ ktime_get_real_ts(&ct);
spin_lock(&root->root_item_lock);
btrfs_set_root_ctransid(item, trans->transid);
btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b0251eb1239f..6f1e4c984b94 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -18,6 +18,7 @@
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
@@ -64,7 +65,7 @@ struct scrub_ctx;
#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
struct scrub_recover {
- atomic_t refs;
+ refcount_t refs;
struct btrfs_bio *bbio;
u64 map_length;
};
@@ -95,7 +96,7 @@ struct scrub_bio {
struct scrub_ctx *sctx;
struct btrfs_device *dev;
struct bio *bio;
- int err;
+ blk_status_t status;
u64 logical;
u64 physical;
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
@@ -112,7 +113,7 @@ struct scrub_block {
struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
int page_count;
atomic_t outstanding_pages;
- atomic_t refs; /* free mem on transition to zero */
+ refcount_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
struct scrub_parity *sparity;
struct {
@@ -140,9 +141,9 @@ struct scrub_parity {
int nsectors;
- int stripe_len;
+ u64 stripe_len;
- atomic_t refs;
+ refcount_t refs;
struct list_head spages;
@@ -161,14 +162,6 @@ struct scrub_parity {
unsigned long bitmap[0];
};
-struct scrub_wr_ctx {
- struct scrub_bio *wr_curr_bio;
- struct btrfs_device *tgtdev;
- int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
- atomic_t flush_all_writes;
- struct mutex wr_lock;
-};
-
struct scrub_ctx {
struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
struct btrfs_fs_info *fs_info;
@@ -183,11 +176,14 @@ struct scrub_ctx {
atomic_t cancel_req;
int readonly;
int pages_per_rd_bio;
- u32 sectorsize;
- u32 nodesize;
int is_dev_replace;
- struct scrub_wr_ctx wr_ctx;
+
+ struct scrub_bio *wr_curr_bio;
+ struct mutex wr_lock;
+ int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+ atomic_t flush_all_writes;
+ struct btrfs_device *wr_tgtdev;
/*
* statistics
@@ -202,7 +198,7 @@ struct scrub_ctx {
* doesn't free the scrub context before or while the workers are
* doing the wakeup() call.
*/
- atomic_t refs;
+ refcount_t refs;
};
struct scrub_fixup_nodatasum {
@@ -240,6 +236,13 @@ struct scrub_warning {
struct btrfs_device *dev;
};
+struct full_stripe_lock {
+ struct rb_node node;
+ u64 logical;
+ u64 refs;
+ struct mutex mutex;
+};
+
static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -282,10 +285,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
u64 *extent_physical,
struct btrfs_device **extent_dev,
int *extent_mirror_num);
-static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx,
- struct btrfs_device *dev,
- int is_dev_replace);
-static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
@@ -305,7 +304,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx);
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
- atomic_inc(&sctx->refs);
+ refcount_inc(&sctx->refs);
atomic_inc(&sctx->bios_in_flight);
}
@@ -349,6 +348,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
}
/*
+ * Insert new full stripe lock into full stripe locks tree
+ *
+ * Return pointer to existing or newly inserted full_stripe_lock structure if
+ * everything works well.
+ * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
+ *
+ * NOTE: caller must hold full_stripe_locks_root->lock before calling this
+ * function
+ */
+static struct full_stripe_lock *insert_full_stripe_lock(
+ struct btrfs_full_stripe_locks_tree *locks_root,
+ u64 fstripe_logical)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct full_stripe_lock *entry;
+ struct full_stripe_lock *ret;
+
+ WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+ p = &locks_root->root.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct full_stripe_lock, node);
+ if (fstripe_logical < entry->logical) {
+ p = &(*p)->rb_left;
+ } else if (fstripe_logical > entry->logical) {
+ p = &(*p)->rb_right;
+ } else {
+ entry->refs++;
+ return entry;
+ }
+ }
+
+ /* Insert new lock */
+ ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+ ret->logical = fstripe_logical;
+ ret->refs = 1;
+ mutex_init(&ret->mutex);
+
+ rb_link_node(&ret->node, parent, p);
+ rb_insert_color(&ret->node, &locks_root->root);
+ return ret;
+}
+
+/*
+ * Search for a full stripe lock of a block group
+ *
+ * Return pointer to existing full stripe lock if found
+ * Return NULL if not found
+ */
+static struct full_stripe_lock *search_full_stripe_lock(
+ struct btrfs_full_stripe_locks_tree *locks_root,
+ u64 fstripe_logical)
+{
+ struct rb_node *node;
+ struct full_stripe_lock *entry;
+
+ WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+ node = locks_root->root.rb_node;
+ while (node) {
+ entry = rb_entry(node, struct full_stripe_lock, node);
+ if (fstripe_logical < entry->logical)
+ node = node->rb_left;
+ else if (fstripe_logical > entry->logical)
+ node = node->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * Helper to get full stripe logical from a normal bytenr.
+ *
+ * Caller must ensure @cache is a RAID56 block group.
+ */
+static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
+ u64 bytenr)
+{
+ u64 ret;
+
+ /*
+ * Due to chunk item size limit, full stripe length should not be
+ * larger than U32_MAX. Just a sanity check here.
+ */
+ WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
+
+ /*
+ * round_down() can only handle power of 2, while RAID56 full
+ * stripe length can be 64KiB * n, so we need to manually round down.
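+ *
+ * For example, with a 192KiB full stripe (three 64KiB data stripes)
+ * and a block group starting at bytenr G, a bytenr of G + 200KiB
+ * rounds down to the full stripe starting at G + 192KiB.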
+ */
+ ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
+ cache->full_stripe_len + cache->key.objectid;
+ return ret;
+}
+
+/*
+ * Lock a full stripe to avoid concurrent recovery and read
+ *
+ * It's only used for profiles with parities (RAID5/6), for other profiles it
+ * does nothing.
+ *
+ * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
+ * The caller must call unlock_full_stripe() in the same context.
+ *
+ * Return <0 if it encounters an error.
+ */
+static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+ bool *locked_ret)
+{
+ struct btrfs_block_group_cache *bg_cache;
+ struct btrfs_full_stripe_locks_tree *locks_root;
+ struct full_stripe_lock *existing;
+ u64 fstripe_start;
+ int ret = 0;
+
+ *locked_ret = false;
+ bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg_cache) {
+ ASSERT(0);
+ return -ENOENT;
+ }
+
+ /* Profiles not based on parity don't need full stripe lock */
+ if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ goto out;
+ locks_root = &bg_cache->full_stripe_locks_root;
+
+ fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+ /* Now insert the full stripe lock */
+ mutex_lock(&locks_root->lock);
+ existing = insert_full_stripe_lock(locks_root, fstripe_start);
+ mutex_unlock(&locks_root->lock);
+ if (IS_ERR(existing)) {
+ ret = PTR_ERR(existing);
+ goto out;
+ }
+ mutex_lock(&existing->mutex);
+ *locked_ret = true;
+out:
+ btrfs_put_block_group(bg_cache);
+ return ret;
+}
+
+/*
+ * Unlock a full stripe.
+ *
+ * NOTE: the caller must be in the same context as the corresponding
+ * lock_full_stripe() call.
+ *
+ * Return 0 if the full stripe was unlocked without problem.
+ * Return <0 on error
+ */
+static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+ bool locked)
+{
+ struct btrfs_block_group_cache *bg_cache;
+ struct btrfs_full_stripe_locks_tree *locks_root;
+ struct full_stripe_lock *fstripe_lock;
+ u64 fstripe_start;
+ bool freeit = false;
+ int ret = 0;
+
+ /* If we didn't acquire the full stripe lock, no need to continue */
+ if (!locked)
+ return 0;
+
+ bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg_cache) {
+ ASSERT(0);
+ return -ENOENT;
+ }
+ if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ goto out;
+
+ locks_root = &bg_cache->full_stripe_locks_root;
+ fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+ mutex_lock(&locks_root->lock);
+ fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
+ /* Unpaired unlock_full_stripe() detected */
+ if (!fstripe_lock) {
+ WARN_ON(1);
+ ret = -ENOENT;
+ mutex_unlock(&locks_root->lock);
+ goto out;
+ }
+
+ if (fstripe_lock->refs == 0) {
+ WARN_ON(1);
+ btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
+ fstripe_lock->logical);
+ } else {
+ fstripe_lock->refs--;
+ }
+
+ if (fstripe_lock->refs == 0) {
+ rb_erase(&fstripe_lock->node, &locks_root->root);
+ freeit = true;
+ }
+ mutex_unlock(&locks_root->lock);
+
+ mutex_unlock(&fstripe_lock->mutex);
+ if (freeit)
+ kfree(fstripe_lock);
+out:
+ btrfs_put_block_group(bg_cache);
+ return ret;
+}
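A hedged sketch of how the pair above is meant to be used, mirroring the scrub_handle_errored_block() changes further down (the recovery step itself is elided):

	bool locked;
	int ret;

	ret = lock_full_stripe(fs_info, logical, &locked);
	if (ret < 0)
		return ret;

	/* recovery/repair work on the full stripe goes here */

	ret = unlock_full_stripe(fs_info, logical, locked);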
+
+/*
* used for workers that require transaction commits (i.e., for the
* NOCOW case)
*/
@@ -356,7 +571,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- atomic_inc(&sctx->refs);
+ refcount_inc(&sctx->refs);
/*
* increment scrubs_running to prevent cancel requests from
* completing as long as a worker is running. we must also
@@ -420,8 +635,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (!sctx)
return;
- scrub_free_wr_ctx(&sctx->wr_ctx);
-
/* this can happen when scrub is cancelled */
if (sctx->curr != -1) {
struct scrub_bio *sbio = sctx->bios[sctx->curr];
@@ -441,13 +654,14 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
kfree(sbio);
}
+ kfree(sctx->wr_curr_bio);
scrub_free_csums(sctx);
kfree(sctx);
}
static void scrub_put_ctx(struct scrub_ctx *sctx)
{
- if (atomic_dec_and_test(&sctx->refs))
+ if (refcount_dec_and_test(&sctx->refs))
scrub_free_ctx(sctx);
}
@@ -457,12 +671,11 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
struct scrub_ctx *sctx;
int i;
struct btrfs_fs_info *fs_info = dev->fs_info;
- int ret;
sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
if (!sctx)
goto nomem;
- atomic_set(&sctx->refs, 1);
+ refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
@@ -487,8 +700,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sctx->bios[i]->next_free = -1;
}
sctx->first_free = 0;
- sctx->nodesize = fs_info->nodesize;
- sctx->sectorsize = fs_info->sectorsize;
atomic_set(&sctx->bios_in_flight, 0);
atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
@@ -499,12 +710,16 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
spin_lock_init(&sctx->stat_lock);
init_waitqueue_head(&sctx->list_wait);
- ret = scrub_setup_wr_ctx(&sctx->wr_ctx,
- fs_info->dev_replace.tgtdev, is_dev_replace);
- if (ret) {
- scrub_free_ctx(sctx);
- return ERR_PTR(ret);
+ WARN_ON(sctx->wr_curr_bio != NULL);
+ mutex_init(&sctx->wr_lock);
+ sctx->wr_curr_bio = NULL;
+ if (is_dev_replace) {
+ WARN_ON(!fs_info->dev_replace.tgtdev);
+ sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
+ sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
+ atomic_set(&sctx->flush_all_writes, 0);
}
+
return sctx;
nomem:
@@ -519,6 +734,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
u32 nlink;
int ret;
int i;
+ unsigned nofs_flag;
struct extent_buffer *eb;
struct btrfs_inode_item *inode_item;
struct scrub_warning *swarn = warn_ctx;
@@ -557,7 +773,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
nlink = btrfs_inode_nlink(eb, inode_item);
btrfs_release_path(swarn->path);
+ /*
+ * init_ipath might indirectly call vmalloc, or use GFP_KERNEL. Scrub
+ * uses GFP_NOFS in this context, so we keep it consistent but it does
+ * not seem to be strictly necessary.
+ */
+ nofs_flag = memalloc_nofs_save();
ipath = init_ipath(4096, local_root, swarn->path);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ipath)) {
ret = PTR_ERR(ipath);
ipath = NULL;
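The memalloc_nofs_save()/memalloc_nofs_restore() pair above scopes the NOFS constraint to the current task instead of threading GFP_NOFS through every allocation inside init_ipath(); a minimal sketch of the pattern (the allocation itself is illustrative):

	unsigned int nofs_flag;
	void *p;

	nofs_flag = memalloc_nofs_save();
	/* every allocation here behaves as if GFP_NOFS had been passed */
	p = kmalloc(4096, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);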
@@ -731,7 +954,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
ret = -EIO;
goto out;
}
- ret = repair_io_failure(BTRFS_I(inode), offset, PAGE_SIZE,
+ ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
fixup->logical, page,
offset - page_offset(page),
fixup->mirror_num);
@@ -857,12 +1080,14 @@ out:
static inline void scrub_get_recover(struct scrub_recover *recover)
{
- atomic_inc(&recover->refs);
+ refcount_inc(&recover->refs);
}
-static inline void scrub_put_recover(struct scrub_recover *recover)
+static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
+ struct scrub_recover *recover)
{
- if (atomic_dec_and_test(&recover->refs)) {
+ if (refcount_dec_and_test(&recover->refs)) {
+ btrfs_bio_counter_dec(fs_info);
btrfs_put_bbio(recover->bbio);
kfree(recover);
}
@@ -892,6 +1117,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
int mirror_index;
int page_num;
int success;
+ bool full_stripe_locked;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -917,6 +1143,24 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
have_csum = sblock_to_check->pagev[0]->have_csum;
dev = sblock_to_check->pagev[0]->dev;
+ /*
+ * For RAID5/6, races can happen between scrub threads of different
+ * devices. On data corruption, the parity and data threads will both
+ * try to recover the data.
+ * Such a race can lead to doubly-counted csum errors, or even
+ * unrecoverable errors.
+ */
+ ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
+ if (ret < 0) {
+ spin_lock(&sctx->stat_lock);
+ if (ret == -ENOMEM)
+ sctx->stat.malloc_errors++;
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return ret;
+ }
+
if (sctx->is_dev_replace && !is_metadata && !have_csum) {
sblocks_for_recheck = NULL;
goto nodatasum_case;
@@ -1241,7 +1485,7 @@ out:
sblock->pagev[page_index]->sblock = NULL;
recover = sblock->pagev[page_index]->recover;
if (recover) {
- scrub_put_recover(recover);
+ scrub_put_recover(fs_info, recover);
sblock->pagev[page_index]->recover =
NULL;
}
@@ -1251,6 +1495,9 @@ out:
kfree(sblocks_for_recheck);
}
+ ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
+ if (ret < 0)
+ return ret;
return 0;
}
@@ -1330,20 +1577,23 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
* with a length of PAGE_SIZE, each returned stripe
* represents one mirror
*/
+ btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &mapped_length, &bbio, 0, 1);
+ logical, &mapped_length, &bbio);
if (ret || !bbio || mapped_length < sublen) {
btrfs_put_bbio(bbio);
+ btrfs_bio_counter_dec(fs_info);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
btrfs_put_bbio(bbio);
+ btrfs_bio_counter_dec(fs_info);
return -ENOMEM;
}
- atomic_set(&recover->refs, 1);
+ refcount_set(&recover->refs, 1);
recover->bbio = bbio;
recover->map_length = mapped_length;
@@ -1365,7 +1615,7 @@ leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
- scrub_put_recover(recover);
+ scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
scrub_page_get(page);
@@ -1407,7 +1657,7 @@ leave_nomem:
scrub_get_recover(recover);
page->recover = recover;
}
- scrub_put_recover(recover);
+ scrub_put_recover(fs_info, recover);
length -= sublen;
logical += sublen;
page_index++;
@@ -1418,14 +1668,14 @@ leave_nomem:
struct scrub_bio_ret {
struct completion event;
- int error;
+ blk_status_t status;
};
static void scrub_bio_wait_endio(struct bio *bio)
{
struct scrub_bio_ret *ret = bio->bi_private;
- ret->error = bio->bi_error;
+ ret->status = bio->bi_status;
complete(&ret->event);
}
@@ -1443,7 +1693,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
int ret;
init_completion(&done.event);
- done.error = 0;
+ done.status = 0;
bio->bi_iter.bi_sector = page->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
@@ -1455,7 +1705,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
return ret;
wait_for_completion(&done.event);
- if (done.error)
+ if (done.status)
return -EIO;
return 0;
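
The scrub_bio_ret change is part of the tree-wide switch from int bi_error to blk_status_t bi_status. The surrounding submit-and-wait shape is the stock completion idiom; a self-contained sketch, assuming a bio whose device, sector, op and pages are already set up (submit_bio_and_wait is a hypothetical name):

        #include <linux/bio.h>
        #include <linux/completion.h>

        struct sync_bio_done {
                struct completion event;
                blk_status_t status;
        };

        static void sync_bio_endio(struct bio *bio)
        {
                struct sync_bio_done *done = bio->bi_private;

                done->status = bio->bi_status;  /* blk_status_t, not an errno */
                complete(&done->event);
        }

        static int submit_bio_and_wait(struct bio *bio)
        {
                struct sync_bio_done done;

                init_completion(&done.event);
                done.status = 0;
                bio->bi_private = &done;
                bio->bi_end_io = sync_bio_endio;
                submit_bio(bio);

                wait_for_completion(&done.event);
                return done.status ? -EIO : 0;
        }
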
@@ -1487,24 +1737,23 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
WARN_ON(!page->page);
- bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio) {
- page->io_error = 1;
- sblock->no_io_error_seen = 0;
- continue;
- }
+ bio = btrfs_io_bio_alloc(1);
bio->bi_bdev = page->dev->bdev;
bio_add_page(bio, page->page, PAGE_SIZE, 0);
if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
- if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
+ page->io_error = 1;
sblock->no_io_error_seen = 0;
+ }
} else {
bio->bi_iter.bi_sector = page->physical >> 9;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
- if (btrfsic_submit_bio_wait(bio))
+ if (btrfsic_submit_bio_wait(bio)) {
+ page->io_error = 1;
sblock->no_io_error_seen = 0;
+ }
}
bio_put(bio);
@@ -1576,9 +1825,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
return -EIO;
}
- bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio)
- return -EIO;
+ bio = btrfs_io_bio_alloc(1);
bio->bi_bdev = page_bad->dev->bdev;
bio->bi_iter.bi_sector = page_bad->physical >> 9;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@ -1634,7 +1881,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
if (spage->io_error) {
void *mapped_buffer = kmap_atomic(spage->page);
- memset(mapped_buffer, 0, PAGE_SIZE);
+ clear_page(mapped_buffer);
flush_dcache_page(spage->page);
kunmap_atomic(mapped_buffer);
}
@@ -1644,37 +1891,31 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
- struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
struct scrub_bio *sbio;
int ret;
- mutex_lock(&wr_ctx->wr_lock);
+ mutex_lock(&sctx->wr_lock);
again:
- if (!wr_ctx->wr_curr_bio) {
- wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
+ if (!sctx->wr_curr_bio) {
+ sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
GFP_KERNEL);
- if (!wr_ctx->wr_curr_bio) {
- mutex_unlock(&wr_ctx->wr_lock);
+ if (!sctx->wr_curr_bio) {
+ mutex_unlock(&sctx->wr_lock);
return -ENOMEM;
}
- wr_ctx->wr_curr_bio->sctx = sctx;
- wr_ctx->wr_curr_bio->page_count = 0;
+ sctx->wr_curr_bio->sctx = sctx;
+ sctx->wr_curr_bio->page_count = 0;
}
- sbio = wr_ctx->wr_curr_bio;
+ sbio = sctx->wr_curr_bio;
if (sbio->page_count == 0) {
struct bio *bio;
sbio->physical = spage->physical_for_dev_replace;
sbio->logical = spage->logical;
- sbio->dev = wr_ctx->tgtdev;
+ sbio->dev = sctx->wr_tgtdev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(GFP_KERNEL,
- wr_ctx->pages_per_wr_bio);
- if (!bio) {
- mutex_unlock(&wr_ctx->wr_lock);
- return -ENOMEM;
- }
+ bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
sbio->bio = bio;
}
@@ -1683,7 +1924,7 @@ again:
bio->bi_bdev = sbio->dev->bdev;
bio->bi_iter.bi_sector = sbio->physical >> 9;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- sbio->err = 0;
+ sbio->status = 0;
} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
spage->physical_for_dev_replace ||
sbio->logical + sbio->page_count * PAGE_SIZE !=
@@ -1697,7 +1938,7 @@ again:
if (sbio->page_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
- mutex_unlock(&wr_ctx->wr_lock);
+ mutex_unlock(&sctx->wr_lock);
return -EIO;
}
scrub_wr_submit(sctx);
@@ -1707,23 +1948,22 @@ again:
sbio->pagev[sbio->page_count] = spage;
scrub_page_get(spage);
sbio->page_count++;
- if (sbio->page_count == wr_ctx->pages_per_wr_bio)
+ if (sbio->page_count == sctx->pages_per_wr_bio)
scrub_wr_submit(sctx);
- mutex_unlock(&wr_ctx->wr_lock);
+ mutex_unlock(&sctx->wr_lock);
return 0;
}
static void scrub_wr_submit(struct scrub_ctx *sctx)
{
- struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
struct scrub_bio *sbio;
- if (!wr_ctx->wr_curr_bio)
+ if (!sctx->wr_curr_bio)
return;
- sbio = wr_ctx->wr_curr_bio;
- wr_ctx->wr_curr_bio = NULL;
+ sbio = sctx->wr_curr_bio;
+ sctx->wr_curr_bio = NULL;
WARN_ON(!sbio->bio->bi_bdev);
scrub_pending_bio_inc(sctx);
/* process all writes in a single worker thread. Then the block layer
@@ -1738,7 +1978,7 @@ static void scrub_wr_bio_end_io(struct bio *bio)
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
- sbio->err = bio->bi_error;
+ sbio->status = bio->bi_status;
sbio->bio = bio;
btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
@@ -1753,7 +1993,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
int i;
WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
- if (sbio->err) {
+ if (sbio->status) {
struct btrfs_dev_replace *dev_replace =
&sbio->sctx->fs_info->dev_replace;
@@ -1827,7 +2067,7 @@ static int scrub_checksum_data(struct scrub_block *sblock)
page = sblock->pagev[0]->page;
buffer = kmap_atomic(page);
- len = sctx->sectorsize;
+ len = sctx->fs_info->sectorsize;
index = 0;
for (;;) {
u64 l = min_t(u64, len, PAGE_SIZE);
@@ -1892,7 +2132,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
BTRFS_UUID_SIZE))
sblock->header_error = 1;
- len = sctx->nodesize - BTRFS_CSUM_SIZE;
+ len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
index = 0;
@@ -1998,12 +2238,12 @@ static int scrub_checksum_super(struct scrub_block *sblock)
static void scrub_block_get(struct scrub_block *sblock)
{
- atomic_inc(&sblock->refs);
+ refcount_inc(&sblock->refs);
}
static void scrub_block_put(struct scrub_block *sblock)
{
- if (atomic_dec_and_test(&sblock->refs)) {
+ if (refcount_dec_and_test(&sblock->refs)) {
int i;
if (sblock->sparity)
@@ -2075,10 +2315,7 @@ again:
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(GFP_KERNEL,
- sctx->pages_per_rd_bio);
- if (!bio)
- return -ENOMEM;
+ bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
sbio->bio = bio;
}
@@ -2087,7 +2324,7 @@ again:
bio->bi_bdev = sbio->dev->bdev;
bio->bi_iter.bi_sector = sbio->physical >> 9;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
- sbio->err = 0;
+ sbio->status = 0;
} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
spage->physical ||
sbio->logical + sbio->page_count * PAGE_SIZE !=
@@ -2123,7 +2360,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
struct scrub_block *sblock = bio->bi_private;
struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
- if (bio->bi_error)
+ if (bio->bi_status)
sblock->no_io_error_seen = 0;
bio_put(bio);
@@ -2166,10 +2403,10 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
scrub_block_put(sblock);
if (sctx->is_dev_replace &&
- atomic_read(&sctx->wr_ctx.flush_all_writes)) {
- mutex_lock(&sctx->wr_ctx.wr_lock);
+ atomic_read(&sctx->flush_all_writes)) {
+ mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_ctx.wr_lock);
+ mutex_unlock(&sctx->wr_lock);
}
scrub_pending_bio_dec(sctx);
@@ -2187,8 +2424,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
int ret;
int i;
+ btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio, 0, 1);
+ &length, &bbio);
if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
@@ -2203,10 +2441,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
goto bbio_out;
}
- bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
- if (!bio)
- goto bbio_out;
-
+ bio = btrfs_io_bio_alloc(0);
bio->bi_iter.bi_sector = logical >> 9;
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
@@ -2231,6 +2466,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
rbio_out:
bio_put(bio);
bbio_out:
+ btrfs_bio_counter_dec(fs_info);
btrfs_put_bbio(bbio);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2255,7 +2491,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
/* one ref inside this function, plus one for each page added to
* a bio later on */
- atomic_set(&sblock->refs, 1);
+ refcount_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
@@ -2332,7 +2568,7 @@ static void scrub_bio_end_io(struct bio *bio)
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
- sbio->err = bio->bi_error;
+ sbio->status = bio->bi_status;
sbio->bio = bio;
btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
@@ -2345,7 +2581,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
int i;
BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
- if (sbio->err) {
+ if (sbio->status) {
for (i = 0; i < sbio->page_count; i++) {
struct scrub_page *spage = sbio->pagev[i];
@@ -2372,10 +2608,10 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
spin_unlock(&sctx->list_lock);
if (sctx->is_dev_replace &&
- atomic_read(&sctx->wr_ctx.flush_all_writes)) {
- mutex_lock(&sctx->wr_ctx.wr_lock);
+ atomic_read(&sctx->flush_all_writes)) {
+ mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_ctx.wr_lock);
+ mutex_unlock(&sctx->wr_lock);
}
scrub_pending_bio_dec(sctx);
@@ -2385,7 +2621,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
unsigned long *bitmap,
u64 start, u64 len)
{
- u32 offset;
+ u64 offset;
int nsectors;
int sectorsize = sparity->sctx->fs_info->sectorsize;
@@ -2395,8 +2631,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
}
start -= sparity->logic_start;
- start = div_u64_rem(start, sparity->stripe_len, &offset);
- offset /= sectorsize;
+ start = div64_u64_rem(start, sparity->stripe_len, &offset);
+ offset = div_u64(offset, sectorsize);
nsectors = (int)len / sectorsize;
if (offset + nsectors <= sparity->nsectors) {
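
div_u64_rem() takes a 32-bit divisor and remainder, so with a u64 stripe_len it would truncate on 32-bit hosts; div64_u64_rem() keeps everything 64-bit. A sketch of the distinction (sector_within_stripe is a hypothetical helper):

        #include <linux/math64.h>

        static u64 sector_within_stripe(u64 start, u64 stripe_len, u32 sectorsize)
        {
                u64 offset;

                /*
                 * div64_u64_rem(): u64 dividend, u64 divisor, u64 remainder.
                 * The div_u64_rem() variant would truncate stripe_len and
                 * the remainder to 32 bits.
                 */
                div64_u64_rem(start, stripe_len, &offset);

                /* sectorsize fits in a u32, so div_u64() is fine here */
                return div_u64(offset, sectorsize);
        }
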
@@ -2470,8 +2706,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
if (!sum)
return 0;
- index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
- num_sectors = sum->len / sctx->sectorsize;
+ index = ((u32)(logical - sum->bytenr)) / sctx->fs_info->sectorsize;
+ num_sectors = sum->len / sctx->fs_info->sectorsize;
memcpy(csum, sum->sums + index, sctx->csum_size);
if (index == num_sectors - 1) {
list_del(&sum->list);
@@ -2490,19 +2726,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
u32 blocksize;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
- blocksize = sctx->sectorsize;
+ blocksize = sctx->fs_info->sectorsize;
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed++;
sctx->stat.data_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- blocksize = sctx->nodesize;
+ blocksize = sctx->fs_info->nodesize;
spin_lock(&sctx->stat_lock);
sctx->stat.tree_extents_scrubbed++;
sctx->stat.tree_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else {
- blocksize = sctx->sectorsize;
+ blocksize = sctx->fs_info->sectorsize;
WARN_ON(1);
}
@@ -2555,7 +2791,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
/* one ref inside this function, plus one for each page added to
* a bio later on */
- atomic_set(&sblock->refs, 1);
+ refcount_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
sblock->sparity = sparity;
@@ -2636,11 +2872,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
}
if (flags & BTRFS_EXTENT_FLAG_DATA) {
- blocksize = sctx->sectorsize;
+ blocksize = sctx->fs_info->sectorsize;
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- blocksize = sctx->nodesize;
+ blocksize = sctx->fs_info->nodesize;
} else {
- blocksize = sctx->sectorsize;
+ blocksize = sctx->fs_info->sectorsize;
WARN_ON(1);
}
@@ -2694,7 +2930,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
for (i = 0; i < nr_data_stripes(map); i++) {
*offset = last_offset + i * map->stripe_len;
- stripe_nr = div_u64(*offset, map->stripe_len);
+ stripe_nr = div64_u64(*offset, map->stripe_len);
stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
/* Work out the disk rotation on this stripe-set */
@@ -2748,7 +2984,7 @@ static void scrub_parity_bio_endio(struct bio *bio)
struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
- if (bio->bi_error)
+ if (bio->bi_status)
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
@@ -2765,7 +3001,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct bio *bio;
struct btrfs_raid_bio *rbio;
- struct scrub_page *spage;
struct btrfs_bio *bbio = NULL;
u64 length;
int ret;
@@ -2775,15 +3010,14 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
goto out;
length = sparity->logic_end - sparity->logic_start;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
- &length, &bbio, 0, 1);
+ &length, &bbio);
if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
- bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
- if (!bio)
- goto bbio_out;
-
+ bio = btrfs_io_bio_alloc(0);
bio->bi_iter.bi_sector = sparity->logic_start >> 9;
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
@@ -2795,9 +3029,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
if (!rbio)
goto rbio_out;
- list_for_each_entry(spage, &sparity->spages, list)
- raid56_add_scrub_pages(rbio, spage->page, spage->logical);
-
scrub_pending_bio_inc(sctx);
raid56_parity_submit_scrub_rbio(rbio);
return;
@@ -2805,6 +3036,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
rbio_out:
bio_put(bio);
bbio_out:
+ btrfs_bio_counter_dec(fs_info);
btrfs_put_bbio(bbio);
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
@@ -2822,12 +3054,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors)
static void scrub_parity_get(struct scrub_parity *sparity)
{
- atomic_inc(&sparity->refs);
+ refcount_inc(&sparity->refs);
}
static void scrub_parity_put(struct scrub_parity *sparity)
{
- if (!atomic_dec_and_test(&sparity->refs))
+ if (!refcount_dec_and_test(&sparity->refs))
return;
scrub_parity_check_and_repair(sparity);
@@ -2879,7 +3111,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
sparity->scrub_dev = sdev;
sparity->logic_start = logic_start;
sparity->logic_end = logic_end;
- atomic_set(&sparity->refs, 1);
+ refcount_set(&sparity->refs, 1);
INIT_LIST_HEAD(&sparity->spages);
sparity->dbitmap = sparity->bitmap;
sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
@@ -3050,9 +3282,9 @@ out:
logic_end - logic_start);
scrub_parity_put(sparity);
scrub_submit(sctx);
- mutex_lock(&sctx->wr_ctx.wr_lock);
+ mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_ctx.wr_lock);
+ mutex_unlock(&sctx->wr_lock);
btrfs_release_path(path);
return ret < 0 ? ret : 0;
@@ -3098,7 +3330,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
physical = map->stripes[num].physical;
offset = 0;
- nstripes = div_u64(length, map->stripe_len);
+ nstripes = div64_u64(length, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
offset = map->stripe_len * num;
increment = map->stripe_len * map->num_stripes;
@@ -3208,14 +3440,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
if (atomic_read(&fs_info->scrub_pause_req)) {
/* push queued extents */
- atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+ atomic_set(&sctx->flush_all_writes, 1);
scrub_submit(sctx);
- mutex_lock(&sctx->wr_ctx.wr_lock);
+ mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_ctx.wr_lock);
+ mutex_unlock(&sctx->wr_lock);
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
- atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+ atomic_set(&sctx->flush_all_writes, 0);
scrub_blocked_if_needed(fs_info);
}
@@ -3422,9 +3654,9 @@ skip:
out:
/* push queued extents */
scrub_submit(sctx);
- mutex_lock(&sctx->wr_ctx.wr_lock);
+ mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_ctx.wr_lock);
+ mutex_unlock(&sctx->wr_lock);
blk_finish_plug(&plug);
btrfs_free_path(path);
@@ -3604,7 +3836,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
btrfs_wait_block_group_reservations(cache);
btrfs_wait_nocow_writers(cache);
- ret = btrfs_wait_ordered_roots(fs_info, -1,
+ ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
cache->key.objectid,
cache->key.offset);
if (ret > 0) {
@@ -3661,11 +3893,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* write requests are really completed when bios_in_flight
* changes to 0.
*/
- atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+ atomic_set(&sctx->flush_all_writes, 1);
scrub_submit(sctx);
- mutex_lock(&sctx->wr_ctx.wr_lock);
+ mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_ctx.wr_lock);
+ mutex_unlock(&sctx->wr_lock);
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
@@ -3679,7 +3911,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
wait_event(sctx->list_wait,
atomic_read(&sctx->workers_pending) == 0);
- atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+ atomic_set(&sctx->flush_all_writes, 0);
scrub_pause_off(fs_info);
@@ -4082,32 +4314,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
btrfs_put_bbio(bbio);
}
-static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx,
- struct btrfs_device *dev,
- int is_dev_replace)
-{
- WARN_ON(wr_ctx->wr_curr_bio != NULL);
-
- mutex_init(&wr_ctx->wr_lock);
- wr_ctx->wr_curr_bio = NULL;
- if (!is_dev_replace)
- return 0;
-
- WARN_ON(!dev->bdev);
- wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
- wr_ctx->tgtdev = dev;
- atomic_set(&wr_ctx->flush_all_writes, 0);
- return 0;
-}
-
-static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
-{
- mutex_lock(&wr_ctx->wr_lock);
- kfree(wr_ctx->wr_curr_bio);
- wr_ctx->wr_curr_bio = NULL;
- mutex_unlock(&wr_ctx->wr_lock);
-}
-
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
int mirror_num, u64 physical_for_dev_replace)
{
@@ -4410,7 +4616,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
struct btrfs_device *dev;
int ret;
- dev = sctx->wr_ctx.tgtdev;
+ dev = sctx->wr_tgtdev;
if (!dev)
return -EIO;
if (!dev->bdev) {
@@ -4418,13 +4624,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
"scrub write_page_nocow(bdev == NULL) is unexpected");
return -EIO;
}
- bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- return -ENOMEM;
- }
+ bio = btrfs_io_bio_alloc(1);
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
bio->bi_bdev = dev->bdev;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a60d5bfb8a49..e937c10b8287 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1069,6 +1069,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
}
}
+ ret = btrfs_is_name_len_valid(eb, path->slots[0],
+ (unsigned long)(di + 1), name_len + data_len);
+ if (!ret) {
+ ret = -EIO;
+ goto out;
+ }
if (name_len + data_len > buf_len) {
buf_len = name_len + data_len;
if (is_vmalloc_addr(buf)) {
@@ -1083,7 +1089,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
buf = tmp;
}
if (!buf) {
- buf = vmalloc(buf_len);
+ buf = kvmalloc(buf_len, GFP_KERNEL);
if (!buf) {
ret = -ENOMEM;
goto out;
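
kvmalloc() replaces the open-coded "try kmalloc, fall back to vmalloc" dance that the later hunks in this file delete, and kvfree() releases either kind of allocation. A minimal sketch with a hypothetical helper:

        #include <linux/mm.h>
        #include <linux/string.h>

        static int process_blob(const void *src, size_t len)
        {
                void *buf;

                /*
                 * kvmalloc() tries kmalloc() first and transparently falls
                 * back to vmalloc() for large or fragmented requests; the
                 * fallback only engages for GFP_KERNEL-compatible flags.
                 */
                buf = kvmalloc(len, GFP_KERNEL);
                if (!buf)
                        return -ENOMEM;

                memcpy(buf, src, len);
                /* ... operate on buf ... */

                kvfree(buf);    /* frees either a kmalloc or a vmalloc buffer */
                return 0;
        }
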
@@ -2769,15 +2775,20 @@ out:
struct recorded_ref {
struct list_head list;
- char *dir_path;
char *name;
struct fs_path *full_path;
u64 dir;
u64 dir_gen;
- int dir_path_len;
int name_len;
};
+static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
+{
+ ref->full_path = path;
+ ref->name = (char *)kbasename(ref->full_path->start);
+ ref->name_len = ref->full_path->end - ref->name;
+}
+
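
set_ref_path() leans on kbasename() from <linux/string.h>, which returns a pointer to the final path component, or the whole string when it contains no '/'. A small illustration (ref_name_len is hypothetical):

        #include <linux/string.h>

        /* kbasename("a/b/name") -> "name"; kbasename("name") -> "name" */
        static size_t ref_name_len(const char *start, const char *end)
        {
                const char *name = kbasename(start);

                return end - name;      /* length of the last component */
        }
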
/*
* We need to process new refs before deleted refs, but compare_tree gives us
* everything mixed. So we first record all refs and later process them.
@@ -2794,17 +2805,7 @@ static int __record_ref(struct list_head *head, u64 dir,
ref->dir = dir;
ref->dir_gen = dir_gen;
- ref->full_path = path;
-
- ref->name = (char *)kbasename(ref->full_path->start);
- ref->name_len = ref->full_path->end - ref->name;
- ref->dir_path = ref->full_path->start;
- if (ref->name == ref->full_path->start)
- ref->dir_path_len = 0;
- else
- ref->dir_path_len = ref->full_path->end -
- ref->full_path->start - 1 - ref->name_len;
-
+ set_ref_path(ref, path);
list_add_tail(&ref->list, head);
return 0;
}
@@ -3546,9 +3547,17 @@ static int is_ancestor(struct btrfs_root *root,
struct fs_path *fs_path)
{
u64 ino = ino2;
+ bool free_path = false;
+ int ret = 0;
+
+ if (!fs_path) {
+ fs_path = fs_path_alloc();
+ if (!fs_path)
+ return -ENOMEM;
+ free_path = true;
+ }
while (ino > BTRFS_FIRST_FREE_OBJECTID) {
- int ret;
u64 parent;
u64 parent_gen;
@@ -3557,13 +3566,18 @@ static int is_ancestor(struct btrfs_root *root,
if (ret < 0) {
if (ret == -ENOENT && ino == ino2)
ret = 0;
- return ret;
+ goto out;
+ }
+ if (parent == ino1) {
+ ret = parent_gen == ino1_gen ? 1 : 0;
+ goto out;
}
- if (parent == ino1)
- return parent_gen == ino1_gen ? 1 : 0;
ino = parent;
}
- return 0;
+ out:
+ if (free_path)
+ fs_path_free(fs_path);
+ return ret;
}
static int wait_for_parent_move(struct send_ctx *sctx,
@@ -3686,6 +3700,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
int is_orphan = 0;
u64 last_dir_ino_rm = 0;
bool can_rename = true;
+ bool orphanized_ancestor = false;
btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
@@ -3837,9 +3852,16 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* might contain the pre-orphanization name of
* ow_inode, which is no longer valid.
*/
- fs_path_reset(valid_path);
- ret = get_cur_path(sctx, sctx->cur_ino,
- sctx->cur_inode_gen, valid_path);
+ ret = is_ancestor(sctx->parent_root,
+ ow_inode, ow_gen,
+ sctx->cur_ino, NULL);
+ if (ret > 0) {
+ orphanized_ancestor = true;
+ fs_path_reset(valid_path);
+ ret = get_cur_path(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen,
+ valid_path);
+ }
if (ret < 0)
goto out;
} else {
@@ -3960,6 +3982,43 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret < 0)
goto out;
if (!ret) {
+ /*
+ * If we orphanized any ancestor before, we need
+ * to recompute the full path for deleted names,
+ * since any such path was computed before we
+ * processed any references and orphanized any
+ * ancestor inode.
+ */
+ if (orphanized_ancestor) {
+ struct fs_path *new_path;
+
+ /*
+ * Our reference's name member points into
+ * its full_path member string, so we
+ * use a new path here.
+ */
+ new_path = fs_path_alloc();
+ if (!new_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = get_cur_path(sctx, cur->dir,
+ cur->dir_gen,
+ new_path);
+ if (ret < 0) {
+ fs_path_free(new_path);
+ goto out;
+ }
+ ret = fs_path_add(new_path,
+ cur->name,
+ cur->name_len);
+ if (ret < 0) {
+ fs_path_free(new_path);
+ goto out;
+ }
+ fs_path_free(cur->full_path);
+ set_ref_path(cur, new_path);
+ }
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
goto out;
@@ -5184,13 +5243,19 @@ static int is_extent_unchanged(struct send_ctx *sctx,
while (key.offset < ekey->offset + left_len) {
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
right_type = btrfs_file_extent_type(eb, ei);
- if (right_type != BTRFS_FILE_EXTENT_REG) {
+ if (right_type != BTRFS_FILE_EXTENT_REG &&
+ right_type != BTRFS_FILE_EXTENT_INLINE) {
ret = 0;
goto out;
}
right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
- right_len = btrfs_file_extent_num_bytes(eb, ei);
+ if (right_type == BTRFS_FILE_EXTENT_INLINE) {
+ right_len = btrfs_file_extent_inline_len(eb, slot, ei);
+ right_len = PAGE_ALIGN(right_len);
+ } else {
+ right_len = btrfs_file_extent_num_bytes(eb, ei);
+ }
right_offset = btrfs_file_extent_offset(eb, ei);
right_gen = btrfs_file_extent_generation(eb, ei);
@@ -5204,6 +5269,19 @@ static int is_extent_unchanged(struct send_ctx *sctx,
goto out;
}
+ /*
+ * We only got here to check whether, when we have an inline
+ * extent, what follows it is a regular extent (i.e. to apply the
+ * above condition to inline extents too). This should normally
+ * not happen, but it is possible, for example, when we have an
+ * inline compressed extent representing data with a size matching
+ * the page size (currently the same as the sector size).
+ */
+ if (right_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = 0;
+ goto out;
+ }
+
left_offset_fixed = left_offset;
if (key.offset < ekey->offset) {
/* Fix the right offset for 2a and 7. */
@@ -6360,22 +6438,16 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
sctx->clone_roots_cnt = arg->clone_sources_count;
sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
- sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN);
+ sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
if (!sctx->send_buf) {
- sctx->send_buf = vmalloc(sctx->send_max_size);
- if (!sctx->send_buf) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
- sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN);
+ sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
if (!sctx->read_buf) {
- sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
- if (!sctx->read_buf) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
sctx->pending_dir_moves = RB_ROOT;
@@ -6384,25 +6456,19 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
- sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
+ sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL);
if (!sctx->clone_roots) {
- sctx->clone_roots = vzalloc(alloc_size);
- if (!sctx->clone_roots) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
if (arg->clone_sources_count) {
- clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
+ clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
if (!clone_sources_tmp) {
- clone_sources_tmp = vmalloc(alloc_size);
- if (!clone_sources_tmp) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 72a053c9a7f0..74e47794e63f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -601,18 +601,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
break;
case Opt_alloc_start:
- num = match_strdup(&args[0]);
- if (num) {
- mutex_lock(&info->chunk_mutex);
- info->alloc_start = memparse(num, NULL);
- mutex_unlock(&info->chunk_mutex);
- kfree(num);
- btrfs_info(info, "allocations start at %llu",
- info->alloc_start);
- } else {
- ret = -ENOMEM;
- goto out;
- }
+ btrfs_info(info,
+ "option alloc_start is obsolete, ignored");
break;
case Opt_acl:
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
@@ -1187,7 +1177,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
- btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -1232,8 +1222,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",nobarrier");
if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
seq_printf(seq, ",max_inline=%llu", info->max_inline);
- if (info->alloc_start != 0)
- seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
if (info->thread_pool_size != min_t(unsigned long,
num_online_cpus() + 2, 8))
seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
@@ -1716,7 +1704,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
unsigned long old_opts = fs_info->mount_opt;
unsigned long old_compress_type = fs_info->compress_type;
u64 old_max_inline = fs_info->max_inline;
- u64 old_alloc_start = fs_info->alloc_start;
int old_thread_pool_size = fs_info->thread_pool_size;
unsigned int old_metadata_ratio = fs_info->metadata_ratio;
int ret;
@@ -1795,8 +1782,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
if (fs_info->fs_devices->missing_devices >
- fs_info->num_tolerated_disk_barrier_failures &&
- !(*flags & MS_RDONLY)) {
+ fs_info->num_tolerated_disk_barrier_failures) {
btrfs_warn(fs_info,
"too many missing devices, writeable remount is not allowed");
ret = -EACCES;
@@ -1856,9 +1842,6 @@ restore:
fs_info->mount_opt = old_opts;
fs_info->compress_type = old_compress_type;
fs_info->max_inline = old_max_inline;
- mutex_lock(&fs_info->chunk_mutex);
- fs_info->alloc_start = old_alloc_start;
- mutex_unlock(&fs_info->chunk_mutex);
btrfs_resize_thread_pool(fs_info,
old_thread_pool_size, fs_info->thread_pool_size);
fs_info->metadata_ratio = old_metadata_ratio;
@@ -1899,18 +1882,15 @@ static inline void btrfs_descending_sort_devices(
static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
u64 *free_bytes)
{
- struct btrfs_root *root = fs_info->tree_root;
struct btrfs_device_info *devices_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 skip_space;
u64 type;
u64 avail_space;
- u64 used_space;
u64 min_stripe_size;
int min_stripes = 1, num_stripes = 1;
int i = 0, nr_devices;
- int ret;
/*
* We aren't under the device list lock, so this is racy-ish, but good
@@ -1928,12 +1908,12 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
}
devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
- GFP_NOFS);
+ GFP_KERNEL);
if (!devices_info)
return -ENOMEM;
/* calc min stripe number for data space allocation */
- type = btrfs_get_alloc_profile(root, 1);
+ type = btrfs_data_alloc_profile(fs_info);
if (type & BTRFS_BLOCK_GROUP_RAID0) {
min_stripes = 2;
num_stripes = nr_devices;
@@ -1950,8 +1930,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
else
min_stripe_size = BTRFS_STRIPE_LEN;
- if (fs_info->alloc_start)
- mutex_lock(&fs_devices->device_list_mutex);
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (!device->in_fs_metadata || !device->bdev ||
@@ -1974,34 +1952,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
*/
skip_space = SZ_1M;
- /* user can set the offset in fs_info->alloc_start. */
- if (fs_info->alloc_start &&
- fs_info->alloc_start + BTRFS_STRIPE_LEN <=
- device->total_bytes) {
- rcu_read_unlock();
- skip_space = max(fs_info->alloc_start, skip_space);
-
- /*
- * btrfs can not use the free space in
- * [0, skip_space - 1], we must subtract it from the
- * total. In order to implement it, we account the used
- * space in this range first.
- */
- ret = btrfs_account_dev_extents_size(device, 0,
- skip_space - 1,
- &used_space);
- if (ret) {
- kfree(devices_info);
- mutex_unlock(&fs_devices->device_list_mutex);
- return ret;
- }
-
- rcu_read_lock();
-
- /* calc the free space in [0, skip_space - 1] */
- skip_space -= used_space;
- }
-
/*
* we can use the free space in [0, skip_space - 1], subtract
* it from the total.
@@ -2020,8 +1970,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
i++;
}
rcu_read_unlock();
- if (fs_info->alloc_start)
- mutex_unlock(&fs_devices->device_list_mutex);
nr_devices = i;
@@ -2058,10 +2006,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
* multiplier to scale the sizes.
*
* Unused device space usage is based on simulating the chunk allocator
- * algorithm that respects the device sizes, order of allocations and the
- * 'alloc_start' value, this is a close approximation of the actual use but
- * there are other factors that may change the result (like a new metadata
- * chunk).
+ * algorithm that respects the device sizes and order of allocations. This is
+ * a close approximation of the actual use but there are other factors that may
+ * change the result (like a new metadata chunk).
*
* If metadata is exhausted, f_bavail will be 0.
*/
@@ -2244,7 +2191,7 @@ static int btrfs_freeze(struct super_block *sb)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
- fs_info->fs_frozen = 1;
+ set_bit(BTRFS_FS_FROZEN, &fs_info->flags);
/*
* We don't need a barrier here, we'll wait for any transaction that
* could be in progress on other threads (and do delayed iputs that
@@ -2263,7 +2210,9 @@ static int btrfs_freeze(struct super_block *sb)
static int btrfs_unfreeze(struct super_block *sb)
{
- btrfs_sb(sb)->fs_frozen = 0;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
return 0;
}
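
BTRFS_FS_FROZEN becomes one more bit in fs_info->flags; set_bit(), clear_bit() and test_bit() are atomic read-modify-write operations, so the dedicated int field is no longer needed. The general shape, with a hypothetical flag word:

        #include <linux/bitops.h>

        enum { MY_STATE_FROZEN };       /* bit numbers, not masks */

        static unsigned long my_flags;

        static void my_freeze(void)
        {
                set_bit(MY_STATE_FROZEN, &my_flags);    /* atomic RMW */
        }

        static void my_unfreeze(void)
        {
                clear_bit(MY_STATE_FROZEN, &my_flags);
        }

        static bool my_is_frozen(void)
        {
                return test_bit(MY_STATE_FROZEN, &my_flags);
        }
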
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 1f157fba8940..c2d5f3580b4c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -447,11 +447,52 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
+static ssize_t quota_override_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ int quota_override;
+
+ quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
+ return snprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+}
+
+static ssize_t quota_override_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ unsigned long knob;
+ int err;
+
+ if (!fs_info)
+ return -EPERM;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ err = kstrtoul(buf, 10, &knob);
+ if (err)
+ return err;
+ if (knob > 1)
+ return -EINVAL;
+
+ if (knob)
+ set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
+ else
+ clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
BTRFS_ATTR_PTR(nodesize),
BTRFS_ATTR_PTR(sectorsize),
BTRFS_ATTR_PTR(clone_alignment),
+ BTRFS_ATTR_PTR(quota_override),
NULL,
};
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index ea272432c930..b18ab8f327a5 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -237,7 +237,6 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
{
memset(trans, 0, sizeof(*trans));
trans->transid = 1;
- INIT_LIST_HEAD(&trans->qgroup_ref_list);
trans->type = __TRANS_DUMMY;
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 133753232a94..d06b1c931d05 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -87,7 +87,7 @@ static int test_find_delalloc(u32 sectorsize)
return -ENOMEM;
}
- extent_io_tree_init(&tmp, &inode->i_data);
+ extent_io_tree_init(&tmp, inode);
/*
* First go through and create and mark all of our pages dirty, we pin
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 61b807de3e16..f615d59b0489 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -60,8 +60,8 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
void btrfs_put_transaction(struct btrfs_transaction *transaction)
{
- WARN_ON(atomic_read(&transaction->use_count) == 0);
- if (atomic_dec_and_test(&transaction->use_count)) {
+ WARN_ON(refcount_read(&transaction->use_count) == 0);
+ if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
if (transaction->delayed_refs.pending_csums)
@@ -93,7 +93,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
btrfs_put_block_group_trimming(cache);
btrfs_put_block_group(cache);
}
- kmem_cache_free(btrfs_transaction_cachep, transaction);
+ kfree(transaction);
}
}
@@ -207,7 +207,7 @@ loop:
spin_unlock(&fs_info->trans_lock);
return -EBUSY;
}
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
extwriter_counter_inc(cur_trans, type);
spin_unlock(&fs_info->trans_lock);
@@ -228,7 +228,7 @@ loop:
*/
BUG_ON(type == TRANS_JOIN_NOLOCK);
- cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
+ cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
if (!cur_trans)
return -ENOMEM;
@@ -238,11 +238,11 @@ loop:
* someone started a transaction after we unlocked. Make sure
* to redo the checks above
*/
- kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ kfree(cur_trans);
goto loop;
} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
spin_unlock(&fs_info->trans_lock);
- kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ kfree(cur_trans);
return -EROFS;
}
@@ -257,7 +257,7 @@ loop:
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
- atomic_set(&cur_trans->use_count, 2);
+ refcount_set(&cur_trans->use_count, 2);
atomic_set(&cur_trans->pending_ordered, 0);
cur_trans->flags = 0;
cur_trans->start_time = get_seconds();
@@ -294,7 +294,7 @@ loop:
spin_lock_init(&cur_trans->dropped_roots_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
- fs_info->btree_inode->i_mapping);
+ fs_info->btree_inode);
fs_info->generation++;
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@@ -432,7 +432,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->trans_lock);
cur_trans = fs_info->running_transaction;
if (cur_trans && is_transaction_blocked(cur_trans)) {
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
spin_unlock(&fs_info->trans_lock);
wait_event(fs_info->transaction_wait,
@@ -572,7 +572,6 @@ again:
h->type = type;
h->can_flush_pending_bgs = true;
- INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
@@ -744,7 +743,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
list_for_each_entry(t, &fs_info->trans_list, list) {
if (t->transid == transid) {
cur_trans = t;
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
ret = 0;
break;
}
@@ -773,7 +772,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
if (t->state == TRANS_STATE_COMPLETED)
break;
cur_trans = t;
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
break;
}
}
@@ -917,7 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
wake_up_process(info->transaction_kthread);
err = -EIO;
}
- assert_qgroups_uptodate(trans);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
if (must_run_delayed_refs) {
@@ -1376,9 +1374,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
ret = commit_fs_roots(trans, fs_info);
if (ret)
goto out;
- ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
- if (ret < 0)
- goto out;
ret = btrfs_qgroup_account_extents(trans, fs_info);
if (ret < 0)
goto out;
@@ -1839,7 +1834,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
/* take transaction reference */
cur_trans = trans->transaction;
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
btrfs_end_transaction(trans);
@@ -1928,7 +1923,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
- btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}
static inline void
@@ -2015,7 +2010,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
spin_lock(&fs_info->trans_lock);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&fs_info->trans_lock);
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
ret = btrfs_end_transaction(trans);
wait_for_commit(cur_trans);
@@ -2035,7 +2030,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
if (prev_trans->state != TRANS_STATE_COMPLETED) {
- atomic_inc(&prev_trans->use_count);
+ refcount_inc(&prev_trans->use_count);
spin_unlock(&fs_info->trans_lock);
wait_for_commit(prev_trans);
@@ -2130,13 +2125,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
goto scrub_continue;
}
- /* Reocrd old roots for later qgroup accounting */
- ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
- if (ret) {
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
-
/*
* make sure none of the code above managed to slip in a
* delayed item
@@ -2179,6 +2167,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_free_log_root_tree(trans, fs_info);
/*
+ * commit_fs_roots() can call btrfs_save_ino_cache(), which generates
+ * new delayed refs. They must be run, or qgroup accounting can be wrong.
+ */
+ ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ if (ret) {
+ mutex_unlock(&fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ /*
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
@@ -2223,7 +2222,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
switch_commit_roots(cur_trans, fs_info);
- assert_qgroups_uptodate(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
ASSERT(list_empty(&cur_trans->io_bgs));
update_super_roots(fs_info);
@@ -2306,7 +2304,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* it'll result in deadlock about SB_FREEZE_FS.
*/
if (current != fs_info->transaction_kthread &&
- current != fs_info->cleaner_kthread && !fs_info->fs_frozen)
+ current != fs_info->cleaner_kthread &&
+ !test_bit(BTRFS_FS_FROZEN, &fs_info->flags))
btrfs_run_delayed_iputs(fs_info);
return ret;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 5dfb5590fff6..c55e44560103 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -18,6 +18,8 @@
#ifndef __BTRFS_TRANSACTION__
#define __BTRFS_TRANSACTION__
+
+#include <linux/refcount.h>
#include "btrfs_inode.h"
#include "delayed-ref.h"
#include "ctree.h"
@@ -49,7 +51,7 @@ struct btrfs_transaction {
* transaction can end
*/
atomic_t num_writers;
- atomic_t use_count;
+ refcount_t use_count;
atomic_t pending_ordered;
unsigned long flags;
@@ -125,8 +127,6 @@ struct btrfs_trans_handle {
unsigned int type;
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
- struct seq_list delayed_ref_elem;
- struct list_head qgroup_ref_list;
struct list_head new_bgs;
};
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a59674c3e69e..f20ef211a73d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1175,15 +1175,19 @@ next:
return 0;
}
-static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
- u32 *namelen, char **name, u64 *index,
- u64 *parent_objectid)
+static int extref_get_fields(struct extent_buffer *eb, int slot,
+ unsigned long ref_ptr, u32 *namelen, char **name,
+ u64 *index, u64 *parent_objectid)
{
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)ref_ptr;
*namelen = btrfs_inode_extref_name_len(eb, extref);
+ if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
+ *namelen))
+ return -EIO;
+
*name = kmalloc(*namelen, GFP_NOFS);
if (*name == NULL)
return -ENOMEM;
@@ -1198,14 +1202,19 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
return 0;
}
-static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
- u32 *namelen, char **name, u64 *index)
+static int ref_get_fields(struct extent_buffer *eb, int slot,
+ unsigned long ref_ptr, u32 *namelen, char **name,
+ u64 *index)
{
struct btrfs_inode_ref *ref;
ref = (struct btrfs_inode_ref *)ref_ptr;
*namelen = btrfs_inode_ref_name_len(eb, ref);
+ if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
+ *namelen))
+ return -EIO;
+
*name = kmalloc(*namelen, GFP_NOFS);
if (*name == NULL)
return -ENOMEM;
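
Both ref_get_fields() and extref_get_fields() now validate the on-disk name length before allocating and copying. btrfs_is_name_len_valid() is the real helper; the underlying idea is plain bounds checking against the end of the item, roughly (name_len_fits is hypothetical):

        #include <linux/types.h>

        /*
         * Reject a name whose claimed length would run past the end of
         * the slot's item data. 'start' is the offset of the name within
         * the leaf, 'item_end' one past the item's last byte.
         */
        static bool name_len_fits(unsigned long start, unsigned long item_end,
                                  u32 name_len)
        {
                return start + name_len >= start &&     /* no overflow */
                       start + name_len <= item_end;
        }
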
@@ -1280,8 +1289,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
while (ref_ptr < ref_end) {
if (log_ref_ver) {
- ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
- &ref_index, &parent_objectid);
+ ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
+ &name, &ref_index, &parent_objectid);
/*
* parent object can change from one array
* item to another.
@@ -1293,8 +1302,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
} else {
- ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
- &ref_index);
+ ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
+ &name, &ref_index);
}
if (ret)
goto out;
@@ -1841,7 +1850,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
- if (verify_dir_item(fs_info, eb, di))
+ if (verify_dir_item(fs_info, eb, slot, di))
return -EIO;
name_len = btrfs_dir_name_len(eb, di);
ret = replay_one_name(trans, root, path, eb, di, key);
@@ -2017,7 +2026,7 @@ again:
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
- if (verify_dir_item(fs_info, eb, di)) {
+ if (verify_dir_item(fs_info, eb, slot, di)) {
ret = -EIO;
goto out;
}
@@ -2102,6 +2111,7 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const u64 ino)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key search_key;
struct btrfs_path *log_path;
int i;
@@ -2143,6 +2153,12 @@ process_leaf:
u32 this_len = sizeof(*di) + name_len + data_len;
char *name;
+ ret = verify_dir_item(fs_info, path->nodes[0],
+ path->slots[0], di);
+ if (ret) {
+ ret = -EIO;
+ goto out;
+ }
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
@@ -4196,7 +4212,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
if (em->generation <= test_gen)
continue;
/* Need a ref to keep it from getting evicted from cache */
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
set_bit(EXTENT_FLAG_LOGGING, &em->flags);
list_add_tail(&em->list, &extents);
num++;
@@ -4546,6 +4562,12 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
this_len = sizeof(*extref) + this_name_len;
}
+ ret = btrfs_is_name_len_valid(eb, slot, name_ptr,
+ this_name_len);
+ if (!ret) {
+ ret = -EIO;
+ goto out;
+ }
if (this_name_len > name_len) {
char *new_name;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ab8a66d852f9..5eb7217738ed 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -139,6 +139,11 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
+ enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_bio **bbio_ret,
+ int mirror_num, int need_raid_map);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@@ -237,6 +242,17 @@ static struct btrfs_device *__alloc_device(void)
if (!dev)
return ERR_PTR(-ENOMEM);
+ /*
+ * Preallocate a bio that's always going to be used for flushing device
+ * barriers and whose lifetime matches that of the device
+ */
+ dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
+ if (!dev->flush_bio) {
+ kfree(dev);
+ return ERR_PTR(-ENOMEM);
+ }
+ bio_get(dev->flush_bio);
+
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->resized_list);
@@ -833,6 +849,7 @@ static void __free_device(struct work_struct *work)
device = container_of(work, struct btrfs_device, rcu_work);
rcu_string_free(device->name);
+ bio_put(device->flush_bio);
kfree(device);
}
@@ -1008,14 +1025,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
+ if (!blk_queue_nonrot(q))
+ fs_devices->rotating = 1;
device->bdev = bdev;
device->in_fs_metadata = 0;
device->mode = flags;
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
- fs_devices->rotating = 1;
-
fs_devices->open_devices++;
if (device->writeable &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -1349,15 +1365,13 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
int ret;
int slot;
struct extent_buffer *l;
- u64 min_search_start;
/*
* We don't want to overwrite the superblock on the drive nor any area
* used by the boot loader (grub for example), so we make sure to start
* at an offset of at least 1MB.
*/
- min_search_start = max(fs_info->alloc_start, 1024ull * 1024);
- search_start = max(search_start, min_search_start);
+ search_start = max_t(u64, search_start, SZ_1M);
path = btrfs_alloc_path();
if (!path)
@@ -2383,7 +2397,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
- device->total_bytes = i_size_read(bdev->bd_inode);
+ device->total_bytes = round_down(i_size_read(bdev->bd_inode),
+ fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
device->fs_info = fs_info;
@@ -2413,16 +2428,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
fs_info->fs_devices->total_devices++;
fs_info->fs_devices->total_rw_bytes += device->total_bytes;
- spin_lock(&fs_info->free_chunk_lock);
- fs_info->free_chunk_space += device->total_bytes;
- spin_unlock(&fs_info->free_chunk_lock);
+ atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ if (!blk_queue_nonrot(q))
fs_info->fs_devices->rotating = 1;
tmp = btrfs_super_total_bytes(fs_info->super_copy);
btrfs_set_super_total_bytes(fs_info->super_copy,
- tmp + device->total_bytes);
+ round_down(tmp + device->total_bytes, fs_info->sectorsize));
tmp = btrfs_super_num_devices(fs_info->super_copy);
btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
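
round_down() (and round_up()) from <linux/kernel.h> require a power-of-two alignment, which sectorsize is guaranteed to be, so the rounding compiles to a simple mask. For example (usable_device_bytes is hypothetical):

        #include <linux/kernel.h>

        /*
         * round_down(x, a) == x & ~(a - 1) for power-of-two 'a', e.g.
         * round_down(1000000, 4096) == 999424 (244 whole sectors).
         */
        static u64 usable_device_bytes(u64 raw_size, u32 sectorsize)
        {
                return round_down(raw_size, sectorsize);
        }
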
@@ -2570,7 +2583,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
goto error;
}
- name = rcu_string_strdup(device_path, GFP_NOFS);
+ name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
kfree(device);
ret = -ENOMEM;
@@ -2685,6 +2698,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
if (!device->writeable)
return -EACCES;
+ new_size = round_down(new_size, fs_info->sectorsize);
+
mutex_lock(&fs_info->chunk_mutex);
old_total = btrfs_super_total_bytes(super_copy);
diff = new_size - device->total_bytes;
@@ -2697,7 +2712,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
fs_devices = fs_info->fs_devices;
- btrfs_set_super_total_bytes(super_copy, old_total + diff);
+ btrfs_set_super_total_bytes(super_copy,
+ round_down(old_total + diff, fs_info->sectorsize));
device->fs_devices->total_rw_bytes += diff;
btrfs_device_set_total_bytes(device, new_size);
@@ -2795,10 +2811,38 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info,
return ret;
}
+static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ em_tree = &fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, length);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_crit(fs_info, "unable to find logical %llu length %llu",
+ logical, length);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (em->start > logical || em->start + em->len < logical) {
+ btrfs_crit(fs_info,
+ "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
+ logical, length, em->start, em->start + em->len);
+ free_extent_map(em);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* callers are responsible for dropping em's ref. */
+ return em;
+}
+
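
get_chunk_map() reports failure through the pointer itself, following the kernel's ERR_PTR convention; its callers check IS_ERR() and unwrap the errno with PTR_ERR(). The caller-side idiom within this file, sketched with a hypothetical use_chunk():

        #include <linux/err.h>

        static int use_chunk(struct btrfs_fs_info *fs_info, u64 offset, u64 size)
        {
                struct extent_map *em;

                em = get_chunk_map(fs_info, offset, size);
                if (IS_ERR(em))
                        return PTR_ERR(em);     /* pointer encodes -errno */

                /* ... use em->map_lookup ... */

                free_extent_map(em);            /* drop the helper's ref */
                return 0;
        }
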
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
- struct extent_map_tree *em_tree;
struct extent_map *em;
struct map_lookup *map;
u64 dev_extent_len = 0;
@@ -2806,23 +2850,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- em_tree = &fs_info->mapping_tree.map_tree;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
-
- if (!em || em->start > chunk_offset ||
- em->start + em->len < chunk_offset) {
+ em = get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(em)) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
ASSERT(0);
- if (em)
- free_extent_map(em);
- return -EINVAL;
+ return PTR_ERR(em);
}
map = em->map_lookup;
mutex_lock(&fs_info->chunk_mutex);
@@ -2850,9 +2886,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_bytes_used(device,
device->bytes_used - dev_extent_len);
- spin_lock(&fs_info->free_chunk_lock);
- fs_info->free_chunk_space += dev_extent_len;
- spin_unlock(&fs_info->free_chunk_lock);
+ atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
}
@@ -3736,7 +3770,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
if (ret)
btrfs_handle_fs_error(fs_info, ret, NULL);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
/* Non-zero return value signifies invalidity */
@@ -3755,6 +3789,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs)
{
struct btrfs_fs_info *fs_info = bctl->fs_info;
+ u64 meta_target, data_target;
u64 allowed;
int mixed = 0;
int ret;
@@ -3851,11 +3886,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
} while (read_seqretry(&fs_info->profiles_lock, seq));
- if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) <
- btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) {
+ /* if we're not converting, the target field is uninitialized */
+ meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->meta.target : fs_info->avail_metadata_alloc_bits;
+ data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->data.target : fs_info->avail_data_alloc_bits;
+ if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
+ btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
btrfs_warn(fs_info,
"metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
- bctl->meta.target, bctl->data.target);
+ meta_target, data_target);
}
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
@@ -3910,7 +3950,7 @@ out:
__cancel_balance(fs_info);
else {
kfree(bctl);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
return ret;
}
@@ -4000,7 +4040,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
- WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+ WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
@@ -4363,7 +4403,10 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = btrfs_device_get_total_bytes(device);
- u64 diff = old_size - new_size;
+ u64 diff;
+
+ new_size = round_down(new_size, fs_info->sectorsize);
+ diff = old_size - new_size;
if (device->is_tgtdev_for_dev_replace)
return -EINVAL;
@@ -4379,9 +4422,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
btrfs_device_set_total_bytes(device, new_size);
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
- spin_lock(&fs_info->free_chunk_lock);
- fs_info->free_chunk_space -= diff;
- spin_unlock(&fs_info->free_chunk_lock);
+ atomic64_sub(diff, &fs_info->free_chunk_space);
}
mutex_unlock(&fs_info->chunk_mutex);
@@ -4492,7 +4533,8 @@ again:
&fs_info->fs_devices->resized_devices);
WARN_ON(diff > old_total);
- btrfs_set_super_total_bytes(super_copy, old_total - diff);
+ btrfs_set_super_total_bytes(super_copy,
+ round_down(old_total - diff, fs_info->sectorsize));
mutex_unlock(&fs_info->chunk_mutex);
/* Now btrfs_update_device() will change the on-disk size. */
@@ -4505,9 +4547,7 @@ done:
btrfs_device_set_total_bytes(device, old_size);
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
- spin_lock(&fs_info->free_chunk_lock);
- fs_info->free_chunk_space += diff;
- spin_unlock(&fs_info->free_chunk_lock);
+ atomic64_add(diff, &fs_info->free_chunk_space);
mutex_unlock(&fs_info->chunk_mutex);
}
return ret;
@@ -4785,7 +4825,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
stripe_size = div_u64(stripe_size, dev_stripes);
/* align to BTRFS_STRIPE_LEN */
- stripe_size = div_u64(stripe_size, raid_stripe_len);
+ stripe_size = div64_u64(stripe_size, raid_stripe_len);
stripe_size *= raid_stripe_len;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -4833,7 +4873,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
ret = add_extent_mapping(em_tree, em, 0);
if (!ret) {
list_add_tail(&em->list, &trans->transaction->pending_chunks);
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
}
write_unlock(&em_tree->lock);
if (ret) {
@@ -4852,9 +4892,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
}
- spin_lock(&info->free_chunk_lock);
- info->free_chunk_space -= (stripe_size * map->num_stripes);
- spin_unlock(&info->free_chunk_lock);
+ atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
free_extent_map(em);
check_raid56_incompat_flag(info, type);
@@ -4888,7 +4926,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- struct extent_map_tree *em_tree;
struct extent_map *em;
struct map_lookup *map;
size_t item_size;
@@ -4897,24 +4934,9 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
int i = 0;
int ret = 0;
- em_tree = &fs_info->mapping_tree.map_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- btrfs_crit(fs_info, "unable to find logical %Lu len %Lu",
- chunk_offset, chunk_size);
- return -EINVAL;
- }
-
- if (em->start != chunk_offset || em->len != chunk_size) {
- btrfs_crit(fs_info,
- "found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu",
- chunk_offset, chunk_size, em->start, em->len);
- free_extent_map(em);
- return -EINVAL;
- }
+ em = get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
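/*
 * Sketch of what the new get_chunk_map() helper factors out, inferred
 * from the open-coded lookups it replaces in this patch (the real
 * implementation is added elsewhere in this file's diff):
 */
static struct extent_map *get_chunk_map_sketch(struct btrfs_fs_info *fs_info,
                                               u64 logical, u64 length)
{
        struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
        struct extent_map *em;

        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, logical, length);
        read_unlock(&em_tree->lock);

        if (!em) {
                btrfs_crit(fs_info, "unable to find logical %llu length %llu",
                           logical, length);
                return ERR_PTR(-EINVAL);
        }

        /* the mapping must cover the requested logical address */
        if (em->start > logical || em->start + em->len < logical) {
                btrfs_crit(fs_info,
                           "found a bad mapping, wanted %llu, found %llu-%llu",
                           logical, em->start, em->start + em->len);
                free_extent_map(em);
                return ERR_PTR(-EINVAL);
        }

        return em;
}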
@@ -5015,20 +5037,19 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *extent_root = fs_info->extent_root;
u64 chunk_offset;
u64 sys_chunk_offset;
u64 alloc_profile;
int ret;
chunk_offset = find_next_chunk(fs_info);
- alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
+ alloc_profile = btrfs_metadata_alloc_profile(fs_info);
ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
if (ret)
return ret;
sys_chunk_offset = find_next_chunk(fs_info);
- alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
+ alloc_profile = btrfs_system_alloc_profile(fs_info);
ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
return ret;
}
@@ -5055,15 +5076,12 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
int readonly = 0;
int miss_ndevs = 0;
int i;
- read_lock(&map_tree->map_tree.lock);
- em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
- read_unlock(&map_tree->map_tree.lock);
- if (!em)
+ em = get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(em))
return 1;
map = em->map_lookup;
@@ -5117,34 +5135,19 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, len);
- read_unlock(&em_tree->lock);
-
- /*
- * We could return errors for these cases, but that could get ugly and
- * we'd probably do the same thing which is just not do anything else
- * and exit, so return 1 so the callers don't try to use other copies.
- */
- if (!em) {
- btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
- logical+len);
- return 1;
- }
-
- if (em->start > logical || em->start + em->len < logical) {
- btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got %Lu-%Lu",
- logical, logical+len, em->start,
- em->start + em->len);
- free_extent_map(em);
+ em = get_chunk_map(fs_info, logical, len);
+ if (IS_ERR(em))
+ /*
+ * We could return errors for these cases, but that could get
+ * ugly and we would probably do the same thing anyway: nothing
+ * else, just exit. So return 1 so the callers don't try to use
+ * other copies.
+ */
return 1;
- }
map = em->map_lookup;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
@@ -5160,7 +5163,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
free_extent_map(em);
btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
+ fs_info->dev_replace.tgtdev)
ret++;
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
@@ -5173,15 +5177,11 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
{
struct extent_map *em;
struct map_lookup *map;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
unsigned long len = fs_info->sectorsize;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, len);
- read_unlock(&em_tree->lock);
- BUG_ON(!em);
+ em = get_chunk_map(fs_info, logical, len);
+ WARN_ON(IS_ERR(em));
- BUG_ON(em->start > logical || em->start + em->len < logical);
map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
@@ -5189,20 +5189,16 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
return len;
}
-int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len, int mirror_num)
{
struct extent_map *em;
struct map_lookup *map;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret = 0;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, len);
- read_unlock(&em_tree->lock);
- BUG_ON(!em);
+ em = get_chunk_map(fs_info, logical, len);
+ WARN_ON(IS_ERR(em));
- BUG_ON(em->start > logical || em->start + em->len < logical);
map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
@@ -5295,25 +5291,353 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
GFP_NOFS|__GFP_NOFAIL);
atomic_set(&bbio->error, 0);
- atomic_set(&bbio->refs, 1);
+ refcount_set(&bbio->refs, 1);
return bbio;
}
void btrfs_get_bbio(struct btrfs_bio *bbio)
{
- WARN_ON(!atomic_read(&bbio->refs));
- atomic_inc(&bbio->refs);
+ WARN_ON(!refcount_read(&bbio->refs));
+ refcount_inc(&bbio->refs);
}
void btrfs_put_bbio(struct btrfs_bio *bbio)
{
if (!bbio)
return;
- if (atomic_dec_and_test(&bbio->refs))
+ if (refcount_dec_and_test(&bbio->refs))
kfree(bbio);
}
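/*
 * The refcount_t conversion above is mechanical; the gain is that
 * refcount_t saturates and WARNs on overflow/underflow instead of
 * silently wrapping like atomic_t. Sketch (names hypothetical):
 */
#include <linux/refcount.h>
#include <linux/slab.h>

struct example_obj {
        refcount_t refs;
};

static struct example_obj *example_alloc(void)
{
        struct example_obj *obj = kzalloc(sizeof(*obj), GFP_NOFS);

        if (obj)
                refcount_set(&obj->refs, 1);    /* caller owns one reference */
        return obj;
}

static void example_put(struct example_obj *obj)
{
        /* free exactly once, when the last reference drops */
        if (refcount_dec_and_test(&obj->refs))
                kfree(obj);
}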
+/*
+ * Can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE?
+ *
+ * Note that discard will not be sent to the target device of a
+ * device replace operation.
+ */
+static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length,
+ struct btrfs_bio **bbio_ret)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_bio *bbio;
+ u64 offset;
+ u64 stripe_nr;
+ u64 stripe_nr_end;
+ u64 stripe_end_offset;
+ u64 stripe_cnt;
+ u64 stripe_len;
+ u64 stripe_offset;
+ u64 num_stripes;
+ u32 stripe_index;
+ u32 factor = 0;
+ u32 sub_stripes = 0;
+ u64 stripes_per_dev = 0;
+ u32 remaining_stripes = 0;
+ u32 last_stripe = 0;
+ int ret = 0;
+ int i;
+
+ /* discard always returns a bbio */
+ ASSERT(bbio_ret);
+
+ em = get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+ /* we don't discard raid56 yet */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ offset = logical - em->start;
+ length = min_t(u64, em->len - offset, length);
+
+ stripe_len = map->stripe_len;
+ /*
+ * stripe_nr counts the total number of stripes we have to stride
+ * to get to this block
+ */
+ stripe_nr = div64_u64(offset, stripe_len);
+
+ /* stripe_offset is the offset of this block in its stripe */
+ stripe_offset = offset - stripe_nr * stripe_len;
+
+ stripe_nr_end = round_up(offset + length, map->stripe_len);
+ stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
+ stripe_cnt = stripe_nr_end - stripe_nr;
+ stripe_end_offset = stripe_nr_end * map->stripe_len -
+ (offset + length);
+ /*
+ * after this, stripe_nr is the number of stripes on this
+ * device we have to walk to find the data, and stripe_index is
+ * the number of our device in the stripe array
+ */
+ num_stripes = 1;
+ stripe_index = 0;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ sub_stripes = 1;
+ else
+ sub_stripes = map->sub_stripes;
+
+ factor = map->num_stripes / sub_stripes;
+ num_stripes = min_t(u64, map->num_stripes,
+ sub_stripes * stripe_cnt);
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ stripe_index *= sub_stripes;
+ stripes_per_dev = div_u64_rem(stripe_cnt, factor,
+ &remaining_stripes);
+ div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
+ last_stripe *= sub_stripes;
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ num_stripes = map->num_stripes;
+ } else {
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
+ }
+
+ bbio = alloc_btrfs_bio(num_stripes, 0);
+ if (!bbio) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ bbio->stripes[i].length = stripes_per_dev *
+ map->stripe_len;
+
+ if (i / sub_stripes < remaining_stripes)
+ bbio->stripes[i].length +=
+ map->stripe_len;
+
+ /*
+ * Special for the first stripe and
+ * the last stripe:
+ *
+ * |-------|...|-------|
+ * |----------|
+ * off end_off
+ */
+ if (i < sub_stripes)
+ bbio->stripes[i].length -=
+ stripe_offset;
+
+ if (stripe_index >= last_stripe &&
+ stripe_index <= (last_stripe +
+ sub_stripes - 1))
+ bbio->stripes[i].length -=
+ stripe_end_offset;
+
+ if (i == sub_stripes - 1)
+ stripe_offset = 0;
+ } else {
+ bbio->stripes[i].length = length;
+ }
+
+ stripe_index++;
+ if (stripe_index == map->num_stripes) {
+ stripe_index = 0;
+ stripe_nr++;
+ }
+ }
+
+ *bbio_ret = bbio;
+ bbio->map_type = map->type;
+ bbio->num_stripes = num_stripes;
+out:
+ free_extent_map(em);
+ return ret;
+}
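/*
 * Worked example for the stripe arithmetic above, assuming
 * stripe_len = 64K and a discard covering offset = 96K,
 * length = 192K within the chunk (so offset + length = 288K):
 *
 *      stripe_nr         = 96K / 64K                  = 1
 *      stripe_offset     = 96K - 1 * 64K              = 32K
 *      stripe_nr_end     = round_up(288K, 64K) / 64K  = 5
 *      stripe_cnt        = 5 - 1                      = 4
 *      stripe_end_offset = 5 * 64K - 288K             = 32K
 *
 * i.e. the discard spans stripes 1..4, starting 32K into stripe 1
 * and stopping 32K short of the end of stripe 4.
 */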
+
+/*
+ * In dev-replace case, for repair case (that's the only case where the mirror
+ * is selected explicitly when calling btrfs_map_block), blocks left of the
+ * left cursor can also be read from the target drive.
+ *
+ * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
+ * array of stripes.
+ * For READ, it also needs to be supported using the same mirror number.
+ *
+ * If the requested block is not left of the left cursor, EIO is returned. This
+ * can happen because btrfs_num_copies() returns one more in the dev-replace
+ * case.
+ */
+static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length,
+ u64 srcdev_devid, int *mirror_num,
+ u64 *physical)
+{
+ struct btrfs_bio *bbio = NULL;
+ int num_stripes;
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+ int i;
+ int ret = 0;
+
+ ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+ logical, &length, &bbio, 0, 0);
+ if (ret) {
+ ASSERT(bbio == NULL);
+ return ret;
+ }
+
+ num_stripes = bbio->num_stripes;
+ if (*mirror_num > num_stripes) {
+ /*
+ * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
+ * that means that the requested area is not left of the left
+ * cursor
+ */
+ btrfs_put_bbio(bbio);
+ return -EIO;
+ }
+
+ /*
+ * Process the rest of the function using the mirror_num of the
+ * source drive, so look it up first. At the end, patch the device
+ * pointer to that of the target drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid != srcdev_devid)
+ continue;
+
+ /*
+ * In case of DUP, in order to keep it simple, only add the
+ * mirror with the lowest physical address
+ */
+ if (found &&
+ physical_of_found <= bbio->stripes[i].physical)
+ continue;
+
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bbio->stripes[i].physical;
+ }
+
+ btrfs_put_bbio(bbio);
+
+ ASSERT(found);
+ if (!found)
+ return -EIO;
+
+ *mirror_num = index_srcdev + 1;
+ *physical = physical_of_found;
+ return ret;
+}
+
+static void handle_ops_on_dev_replace(enum btrfs_map_op op,
+ struct btrfs_bio **bbio_ret,
+ struct btrfs_dev_replace *dev_replace,
+ int *num_stripes_ret, int *max_errors_ret)
+{
+ struct btrfs_bio *bbio = *bbio_ret;
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+ int tgtdev_indexes = 0;
+ int num_stripes = *num_stripes_ret;
+ int max_errors = *max_errors_ret;
+ int i;
+
+ if (op == BTRFS_MAP_WRITE) {
+ int index_where_to_add;
+
+ /*
+ * duplicate the write operations while the dev replace
+ * procedure is running. Since the copying of the old disk to
+ * the new disk takes place at run time while the filesystem is
+ * mounted writable, the regular write operations to the old
+ * disk have to be duplicated to go to the new disk as well.
+ *
+ * Note that device->missing is handled by the caller, and that
+ * the write to the old disk is already set up in the stripes
+ * array.
+ */
+ index_where_to_add = num_stripes;
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /* write to new disk, too */
+ struct btrfs_bio_stripe *new =
+ bbio->stripes + index_where_to_add;
+ struct btrfs_bio_stripe *old =
+ bbio->stripes + i;
+
+ new->physical = old->physical;
+ new->length = old->length;
+ new->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[i] = index_where_to_add;
+ index_where_to_add++;
+ max_errors++;
+ tgtdev_indexes++;
+ }
+ }
+ num_stripes = index_where_to_add;
+ } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+
+ /*
+ * During the dev-replace procedure, the target drive can also
+ * be used to read data in case it is needed to repair a corrupt
+ * block elsewhere. This is possible if the requested area is
+ * left of the left cursor. In this area, the target drive is a
+ * full copy of the source drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /*
+ * In case of DUP, in order to keep it simple,
+ * only add the mirror with the lowest physical
+ * address
+ */
+ if (found &&
+ physical_of_found <=
+ bbio->stripes[i].physical)
+ continue;
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bbio->stripes[i].physical;
+ }
+ }
+ if (found) {
+ struct btrfs_bio_stripe *tgtdev_stripe =
+ bbio->stripes + num_stripes;
+
+ tgtdev_stripe->physical = physical_of_found;
+ tgtdev_stripe->length =
+ bbio->stripes[index_srcdev].length;
+ tgtdev_stripe->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[index_srcdev] = num_stripes;
+
+ tgtdev_indexes++;
+ num_stripes++;
+ }
+ }
+
+ *num_stripes_ret = num_stripes;
+ *max_errors_ret = max_errors;
+ bbio->num_tgtdevs = tgtdev_indexes;
+ *bbio_ret = bbio;
+}
+
+static bool need_full_stripe(enum btrfs_map_op op)
+{
+ return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
+}
+
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
@@ -5322,14 +5646,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
u64 offset;
u64 stripe_offset;
- u64 stripe_end_offset;
u64 stripe_nr;
- u64 stripe_nr_orig;
- u64 stripe_nr_end;
u64 stripe_len;
u32 stripe_index;
int i;
@@ -5345,23 +5664,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
u64 physical_to_patch_in_first_stripe = 0;
u64 raid56_full_stripe_start = (u64)-1;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, *length);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- btrfs_crit(fs_info, "unable to find logical %llu len %llu",
- logical, *length);
- return -EINVAL;
- }
+ if (op == BTRFS_MAP_DISCARD)
+ return __btrfs_map_block_for_discard(fs_info, logical,
+ *length, bbio_ret);
- if (em->start > logical || em->start + em->len < logical) {
- btrfs_crit(fs_info,
- "found a bad mapping, wanted %Lu, found %Lu-%Lu",
- logical, em->start, em->start + em->len);
- free_extent_map(em);
- return -EINVAL;
- }
+ em = get_chunk_map(fs_info, logical, *length);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
map = em->map_lookup;
offset = logical - em->start;
@@ -5400,14 +5709,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
raid56_full_stripe_start *= full_stripe_len;
}
- if (op == BTRFS_MAP_DISCARD) {
- /* we don't discard raid56 yet */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = -EOPNOTSUPP;
- goto out;
- }
- *length = min_t(u64, em->len - offset, *length);
- } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
u64 max_len;
/* For writes to RAID[56], allow a full stripeset across all disks.
For other RAID types and for RAID[56] reads, just allow a single
@@ -5438,105 +5740,28 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
btrfs_dev_replace_set_lock_blocking(dev_replace);
if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
- op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
- op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) {
- /*
- * in dev-replace case, for repair case (that's the only
- * case where the mirror is selected explicitly when
- * calling btrfs_map_block), blocks left of the left cursor
- * can also be read from the target drive.
- * For REQ_GET_READ_MIRRORS, the target drive is added as
- * the last one to the array of stripes. For READ, it also
- * needs to be supported using the same mirror number.
- * If the requested block is not left of the left cursor,
- * EIO is returned. This can happen because btrfs_num_copies()
- * returns one more in the dev-replace case.
- */
- u64 tmp_length = *length;
- struct btrfs_bio *tmp_bbio = NULL;
- int tmp_num_stripes;
- u64 srcdev_devid = dev_replace->srcdev->devid;
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
-
- ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &tmp_length, &tmp_bbio, 0, 0);
- if (ret) {
- WARN_ON(tmp_bbio != NULL);
- goto out;
- }
-
- tmp_num_stripes = tmp_bbio->num_stripes;
- if (mirror_num > tmp_num_stripes) {
- /*
- * BTRFS_MAP_GET_READ_MIRRORS does not contain this
- * mirror, that means that the requested area
- * is not left of the left cursor
- */
- ret = -EIO;
- btrfs_put_bbio(tmp_bbio);
- goto out;
- }
-
- /*
- * process the rest of the function using the mirror_num
- * of the source drive. Therefore look it up first.
- * At the end, patch the device pointer to the one of the
- * target drive.
- */
- for (i = 0; i < tmp_num_stripes; i++) {
- if (tmp_bbio->stripes[i].dev->devid != srcdev_devid)
- continue;
-
- /*
- * In case of DUP, in order to keep it simple, only add
- * the mirror with the lowest physical address
- */
- if (found &&
- physical_of_found <= tmp_bbio->stripes[i].physical)
- continue;
-
- index_srcdev = i;
- found = 1;
- physical_of_found = tmp_bbio->stripes[i].physical;
- }
-
- btrfs_put_bbio(tmp_bbio);
-
- if (!found) {
- WARN_ON(1);
- ret = -EIO;
+ !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
+ ret = get_extra_mirror_from_replace(fs_info, logical, *length,
+ dev_replace->srcdev->devid,
+ &mirror_num,
+ &physical_to_patch_in_first_stripe);
+ if (ret)
goto out;
- }
-
- mirror_num = index_srcdev + 1;
- patch_the_first_stripe_for_dev_replace = 1;
- physical_to_patch_in_first_stripe = physical_of_found;
+ else
+ patch_the_first_stripe_for_dev_replace = 1;
} else if (mirror_num > map->num_stripes) {
mirror_num = 0;
}
num_stripes = 1;
stripe_index = 0;
- stripe_nr_orig = stripe_nr;
- stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
- stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
- stripe_end_offset = stripe_nr_end * map->stripe_len -
- (offset + *length);
-
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- if (op == BTRFS_MAP_DISCARD)
- num_stripes = min_t(u64, map->num_stripes,
- stripe_nr_end - stripe_nr_orig);
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
- if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
- op != BTRFS_MAP_GET_READ_MIRRORS)
+ if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS)
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
- op == BTRFS_MAP_GET_READ_MIRRORS)
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
@@ -5549,8 +5774,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
- op == BTRFS_MAP_GET_READ_MIRRORS) {
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) {
num_stripes = map->num_stripes;
} else if (mirror_num) {
stripe_index = mirror_num - 1;
@@ -5566,10 +5790,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
num_stripes = map->sub_stripes;
- else if (op == BTRFS_MAP_DISCARD)
- num_stripes = min_t(u64, map->sub_stripes *
- (stripe_nr_end - stripe_nr_orig),
- map->num_stripes);
else if (mirror_num)
stripe_index += mirror_num - 1;
else {
@@ -5587,7 +5807,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
(op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS ||
mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
- stripe_nr = div_u64(raid56_full_stripe_start,
+ stripe_nr = div64_u64(raid56_full_stripe_start,
stripe_len * nr_data_stripes(map));
/* RAID[56] write or recovery. Return all stripes */
@@ -5612,8 +5832,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
&stripe_index);
- if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
- op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1)
+ if ((op != BTRFS_MAP_WRITE &&
+ op != BTRFS_MAP_GET_READ_MIRRORS) &&
+ mirror_num <= 1)
mirror_num = 1;
}
} else {
@@ -5635,8 +5856,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
num_alloc_stripes = num_stripes;
- if (dev_replace_is_ongoing) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD)
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
+ if (op == BTRFS_MAP_WRITE)
num_alloc_stripes <<= 1;
if (op == BTRFS_MAP_GET_READ_MIRRORS)
num_alloc_stripes++;
@@ -5648,14 +5869,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
ret = -ENOMEM;
goto out;
}
- if (dev_replace_is_ongoing)
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
/* build raid_map */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
- need_raid_map &&
- ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) ||
- mirror_num > 1)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
+ (need_full_stripe(op) || mirror_num > 1)) {
u64 tmp;
unsigned rot;
@@ -5679,173 +5898,27 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
RAID6_Q_STRIPE;
}
- if (op == BTRFS_MAP_DISCARD) {
- u32 factor = 0;
- u32 sub_stripes = 0;
- u64 stripes_per_dev = 0;
- u32 remaining_stripes = 0;
- u32 last_stripe = 0;
-
- if (map->type &
- (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID0)
- sub_stripes = 1;
- else
- sub_stripes = map->sub_stripes;
-
- factor = map->num_stripes / sub_stripes;
- stripes_per_dev = div_u64_rem(stripe_nr_end -
- stripe_nr_orig,
- factor,
- &remaining_stripes);
- div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
- last_stripe *= sub_stripes;
- }
-
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
- map->stripes[stripe_index].physical +
- stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
-
- if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID10)) {
- bbio->stripes[i].length = stripes_per_dev *
- map->stripe_len;
-
- if (i / sub_stripes < remaining_stripes)
- bbio->stripes[i].length +=
- map->stripe_len;
- /*
- * Special for the first stripe and
- * the last stripe:
- *
- * |-------|...|-------|
- * |----------|
- * off end_off
- */
- if (i < sub_stripes)
- bbio->stripes[i].length -=
- stripe_offset;
-
- if (stripe_index >= last_stripe &&
- stripe_index <= (last_stripe +
- sub_stripes - 1))
- bbio->stripes[i].length -=
- stripe_end_offset;
-
- if (i == sub_stripes - 1)
- stripe_offset = 0;
- } else
- bbio->stripes[i].length = *length;
-
- stripe_index++;
- if (stripe_index == map->num_stripes) {
- /* This could only happen for RAID0/10 */
- stripe_index = 0;
- stripe_nr++;
- }
- }
- } else {
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
- map->stripes[stripe_index].physical +
- stripe_offset +
- stripe_nr * map->stripe_len;
- bbio->stripes[i].dev =
- map->stripes[stripe_index].dev;
- stripe_index++;
- }
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset +
+ stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev =
+ map->stripes[stripe_index].dev;
+ stripe_index++;
}
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
+ if (need_full_stripe(op))
max_errors = btrfs_chunk_max_errors(map);
if (bbio->raid_map)
sort_parity_stripes(bbio, num_stripes);
- tgtdev_indexes = 0;
- if (dev_replace_is_ongoing &&
- (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) &&
- dev_replace->tgtdev != NULL) {
- int index_where_to_add;
- u64 srcdev_devid = dev_replace->srcdev->devid;
-
- /*
- * duplicate the write operations while the dev replace
- * procedure is running. Since the copying of the old disk
- * to the new disk takes place at run time while the
- * filesystem is mounted writable, the regular write
- * operations to the old disk have to be duplicated to go
- * to the new disk as well.
- * Note that device->missing is handled by the caller, and
- * that the write to the old disk is already set up in the
- * stripes array.
- */
- index_where_to_add = num_stripes;
- for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
- /* write to new disk, too */
- struct btrfs_bio_stripe *new =
- bbio->stripes + index_where_to_add;
- struct btrfs_bio_stripe *old =
- bbio->stripes + i;
-
- new->physical = old->physical;
- new->length = old->length;
- new->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[i] = index_where_to_add;
- index_where_to_add++;
- max_errors++;
- tgtdev_indexes++;
- }
- }
- num_stripes = index_where_to_add;
- } else if (dev_replace_is_ongoing &&
- op == BTRFS_MAP_GET_READ_MIRRORS &&
- dev_replace->tgtdev != NULL) {
- u64 srcdev_devid = dev_replace->srcdev->devid;
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
-
- /*
- * During the dev-replace procedure, the target drive can
- * also be used to read data in case it is needed to repair
- * a corrupt block elsewhere. This is possible if the
- * requested area is left of the left cursor. In this area,
- * the target drive is a full copy of the source drive.
- */
- for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
- /*
- * In case of DUP, in order to keep it
- * simple, only add the mirror with the
- * lowest physical address
- */
- if (found &&
- physical_of_found <=
- bbio->stripes[i].physical)
- continue;
- index_srcdev = i;
- found = 1;
- physical_of_found = bbio->stripes[i].physical;
- }
- }
- if (found) {
- struct btrfs_bio_stripe *tgtdev_stripe =
- bbio->stripes + num_stripes;
-
- tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->length =
- bbio->stripes[index_srcdev].length;
- tgtdev_stripe->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[index_srcdev] = num_stripes;
-
- tgtdev_indexes++;
- num_stripes++;
- }
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
+ need_full_stripe(op)) {
+ handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
+ &max_errors);
}
*bbio_ret = bbio;
@@ -5853,7 +5926,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
- bbio->num_tgtdevs = tgtdev_indexes;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -5886,19 +5958,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num,
- int need_raid_map)
+ struct btrfs_bio **bbio_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
- mirror_num, need_raid_map);
+ return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}
int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len)
{
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
struct extent_map *em;
struct map_lookup *map;
u64 *buf;
@@ -5908,24 +5976,11 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
u64 rmap_len;
int i, j, nr = 0;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_start, 1);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- btrfs_err(fs_info, "couldn't find em for chunk %Lu",
- chunk_start);
+ em = get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(em))
return -EIO;
- }
- if (em->start != chunk_start) {
- btrfs_err(fs_info, "bad chunk start, em=%Lu, wanted=%Lu",
- em->start, chunk_start);
- free_extent_map(em);
- return -EIO;
- }
map = em->map_lookup;
-
length = em->len;
rmap_len = map->stripe_len;
@@ -5949,7 +6004,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
continue;
stripe_nr = physical - map->stripes[i].physical;
- stripe_nr = div_u64(stripe_nr, map->stripe_len);
+ stripe_nr = div64_u64(stripe_nr, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;
@@ -5994,9 +6049,10 @@ static void btrfs_end_bio(struct bio *bio)
struct btrfs_bio *bbio = bio->bi_private;
int is_orig_bio = 0;
- if (bio->bi_error) {
+ if (bio->bi_status) {
atomic_inc(&bbio->error);
- if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) {
+ if (bio->bi_status == BLK_STS_IOERR ||
+ bio->bi_status == BLK_STS_TARGET) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
struct btrfs_device *dev;
@@ -6034,13 +6090,13 @@ static void btrfs_end_bio(struct bio *bio)
* beyond the tolerance of the btrfs bio
*/
if (atomic_read(&bbio->error) > bbio->max_errors) {
- bio->bi_error = -EIO;
+ bio->bi_status = BLK_STS_IOERR;
} else {
/*
* this bio is actually up to date, we didn't
* go over the max number of errors
*/
- bio->bi_error = 0;
+ bio->bi_status = 0;
}
btrfs_end_bbio(bbio, bio);
@@ -6151,7 +6207,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- bio->bi_error = -EIO;
+ bio->bi_status = BLK_STS_IOERR;
btrfs_end_bbio(bbio, bio);
}
}
@@ -6218,10 +6274,9 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
continue;
}
- if (dev_nr < total_devs - 1) {
- bio = btrfs_bio_clone(first_bio, GFP_NOFS);
- BUG_ON(!bio); /* -ENOMEM */
- } else
+ if (dev_nr < total_devs - 1)
+ bio = btrfs_bio_clone(first_bio);
+ else
bio = first_bio;
submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
@@ -6636,10 +6691,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
device->in_fs_metadata = 1;
if (device->writeable && !device->is_tgtdev_for_dev_replace) {
device->fs_devices->total_rw_bytes += device->total_bytes;
- spin_lock(&fs_info->free_chunk_lock);
- fs_info->free_chunk_space += device->total_bytes -
- device->bytes_used;
- spin_unlock(&fs_info->free_chunk_lock);
+ atomic64_add(device->total_bytes - device->bytes_used,
+ &fs_info->free_chunk_space);
}
ret = 0;
return ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 59be81206dd7..6f45fd60d15a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -74,6 +74,8 @@ struct btrfs_device {
int missing;
int can_discard;
int is_tgtdev_for_dev_replace;
+ int last_flush_error;
+ int flush_bio_sent;
#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
seqcount_t data_seqcount;
@@ -123,7 +125,6 @@ struct btrfs_device {
struct list_head resized_list;
/* for sending down flush barriers */
- int nobarriers;
struct bio *flush_bio;
struct completion flush_wait;
@@ -280,6 +281,11 @@ struct btrfs_io_bio {
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
u8 *csum_allocated;
btrfs_io_bio_end_io_t *end_io;
+ struct bvec_iter iter;
+ /*
+ * This member must come last: bio_alloc_bioset will allocate enough
+ * bytes for the entire btrfs_io_bio, but relies on bio being last.
+ */
struct bio bio;
};
@@ -298,7 +304,7 @@ struct btrfs_bio;
typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
struct btrfs_bio {
- atomic_t refs;
+ refcount_t refs;
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
u64 map_type; /* get from map_lookup->type */
@@ -400,8 +406,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_bio **bbio_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num,
- int need_raid_map);
+ struct btrfs_bio **bbio_ret);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len);
@@ -475,7 +480,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
-int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len, int mirror_num);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
struct btrfs_mapping_tree *map_tree,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b3cbf80c5acf..2c7e53f9ff1b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -336,7 +336,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
u32 this_len = sizeof(*di) + name_len + data_len;
unsigned long name_ptr = (unsigned long)(di + 1);
- if (verify_dir_item(fs_info, leaf, di)) {
+ if (verify_dir_item(fs_info, leaf, slot, di)) {
ret = -EIO;
goto err;
}
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 135b10823c6d..c248f9286366 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -24,12 +24,13 @@
#include <linux/slab.h>
#include <linux/zlib.h>
#include <linux/zutil.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
+#include <linux/refcount.h>
#include "compression.h"
struct workspace {
@@ -42,7 +43,7 @@ static void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- vfree(workspace->strm.workspace);
+ kvfree(workspace->strm.workspace);
kfree(workspace->buf);
kfree(workspace);
}
@@ -52,14 +53,14 @@ static struct list_head *zlib_alloc_workspace(void)
struct workspace *workspace;
int workspacesize;
- workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+ workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
if (!workspace)
return ERR_PTR(-ENOMEM);
workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
zlib_inflate_workspacesize());
- workspace->strm.workspace = vmalloc(workspacesize);
- workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS);
+ workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
+ workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
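/*
 * kvmalloc()/kvfree() collapse the old "try kmalloc, fall back to
 * vmalloc" dance into one call, as the workspace allocation above now
 * does. Minimal sketch:
 */
#include <linux/mm.h>

static void *example_alloc_big(size_t size)
{
        /* physically contiguous if possible, vmalloc'ed otherwise */
        return kvmalloc(size, GFP_KERNEL);
}

static void example_free_big(void *p)
{
        kvfree(p);      /* handles both allocation paths; never plain kfree() */
}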
@@ -211,10 +212,7 @@ out:
return ret;
}
-static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in,
- u64 disk_start,
- struct bio *orig_bio,
- size_t srclen)
+static int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
@@ -222,8 +220,12 @@ static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in,
char *data_in;
size_t total_out = 0;
unsigned long page_in_index = 0;
+ size_t srclen = cb->compressed_len;
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
+ struct page **pages_in = cb->compressed_pages;
+ u64 disk_start = cb->start;
+ struct bio *orig_bio = cb->orig_bio;
data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
diff --git a/fs/buffer.c b/fs/buffer.c
index 9196f2a270da..ea0e05ec2916 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -49,8 +49,7 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- unsigned long bio_flags,
- struct writeback_control *wbc);
+ enum rw_hint hint, struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -179,7 +178,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh, ", lost sync page write");
- set_buffer_write_io_error(bh);
+ mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
}
unlock_buffer(bh);
@@ -353,8 +352,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh, ", lost async page write");
- mapping_set_error(page->mapping, -EIO);
- set_buffer_write_io_error(bh);
+ mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
SetPageError(page);
}
@@ -482,8 +480,6 @@ static void __remove_assoc_queue(struct buffer_head *bh)
{
list_del_init(&bh->b_assoc_buffers);
WARN_ON(!bh->b_assoc_map);
- if (buffer_write_io_error(bh))
- set_bit(AS_EIO, &bh->b_assoc_map->flags);
bh->b_assoc_map = NULL;
}
@@ -1182,6 +1178,17 @@ void mark_buffer_dirty(struct buffer_head *bh)
}
EXPORT_SYMBOL(mark_buffer_dirty);
+void mark_buffer_write_io_error(struct buffer_head *bh)
+{
+ set_buffer_write_io_error(bh);
+ /* FIXME: do we need to set this in both places? */
+ if (bh->b_page && bh->b_page->mapping)
+ mapping_set_error(bh->b_page->mapping, -EIO);
+ if (bh->b_assoc_map)
+ mapping_set_error(bh->b_assoc_map, -EIO);
+}
+EXPORT_SYMBOL(mark_buffer_write_io_error);
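/*
 * Usage sketch for the new helper, modelled on the completion handlers
 * converted above: on a lost write, flag the buffer and let the helper
 * propagate -EIO to the owning mapping(s).
 */
static void example_end_write(struct buffer_head *bh, int uptodate)
{
        if (!uptodate) {
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
}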
+
/*
* Decrement a buffer_head's reference count. If all buffers against a page
* have zero reference count, are clean and unlocked, and if the page is clean
@@ -1830,7 +1837,8 @@ int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
+ inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
@@ -1884,7 +1892,8 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
+ inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
@@ -2379,8 +2388,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
goto out;
err = pagecache_write_begin(NULL, mapping, size, 0,
- AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
- &page, &fsdata);
+ AOP_FLAG_CONT_EXPAND, &page, &fsdata);
if (err)
goto out;
@@ -2415,9 +2423,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = PAGE_SIZE - zerofrom;
- err = pagecache_write_begin(file, mapping, curpos, len,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
+ err = pagecache_write_begin(file, mapping, curpos, len, 0,
+ &page, &fsdata);
if (err)
goto out;
zero_user(page, zerofrom, len);
@@ -2449,9 +2456,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = offset - zerofrom;
- err = pagecache_write_begin(file, mapping, curpos, len,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
+ err = pagecache_write_begin(file, mapping, curpos, len, 0,
+ &page, &fsdata);
if (err)
goto out;
zero_user(page, zerofrom, len);
@@ -3025,11 +3031,11 @@ EXPORT_SYMBOL(block_write_full_page);
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
get_block_t *get_block)
{
- struct buffer_head tmp;
struct inode *inode = mapping->host;
- tmp.b_state = 0;
- tmp.b_blocknr = 0;
- tmp.b_size = i_blocksize(inode);
+ struct buffer_head tmp = {
+ .b_size = i_blocksize(inode),
+ };
+
get_block(inode, block, &tmp, 0);
return tmp.b_blocknr;
}
@@ -3042,7 +3048,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
if (unlikely(bio_flagged(bio, BIO_QUIET)))
set_bit(BH_Quiet, &bh->b_state);
- bh->b_end_io(bh, !bio->bi_error);
+ bh->b_end_io(bh, !bio->bi_status);
bio_put(bio);
}
@@ -3095,7 +3101,7 @@ void guard_bio_eod(int op, struct bio *bio)
}
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- unsigned long bio_flags, struct writeback_control *wbc)
+ enum rw_hint write_hint, struct writeback_control *wbc)
{
struct bio *bio;
@@ -3124,13 +3130,13 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
+ bio->bi_write_hint = write_hint;
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
BUG_ON(bio->bi_iter.bi_size != bh->b_size);
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
- bio->bi_flags |= bio_flags;
/* Take care of bh's that straddle the end of the device */
guard_bio_eod(op, bio);
@@ -3145,14 +3151,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
return 0;
}
-int _submit_bh(int op, int op_flags, struct buffer_head *bh,
- unsigned long bio_flags)
-{
- return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL);
-}
-EXPORT_SYMBOL_GPL(_submit_bh);
-
-int submit_bh(int op, int op_flags, struct buffer_head *bh)
+int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
return submit_bh_wbc(op, op_flags, bh, 0, NULL);
}
@@ -3291,8 +3290,6 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
bh = head;
do {
- if (buffer_write_io_error(bh) && page->mapping)
- mapping_set_error(page->mapping, -EIO);
if (buffer_busy(bh))
goto failed;
bh = bh->b_this_page;
@@ -3504,6 +3501,130 @@ int bh_submit_read(struct buffer_head *bh)
}
EXPORT_SYMBOL(bh_submit_read);
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
+ *
+ * Returns the offset within the file on success, and -ENOENT otherwise.
+ */
+static loff_t
+page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
+{
+ loff_t offset = page_offset(page);
+ struct buffer_head *bh, *head;
+ bool seek_data = whence == SEEK_DATA;
+
+ if (lastoff < offset)
+ lastoff = offset;
+
+ bh = head = page_buffers(page);
+ do {
+ offset += bh->b_size;
+ if (lastoff >= offset)
+ continue;
+
+ /*
+ * Unwritten extents that have data in the page cache covering
+ * them can be identified by the BH_Unwritten state flag.
+ * Pages with multiple buffers might have a mix of holes, data
+ * and unwritten extents - any buffer with valid data in it
+ * should have BH_Uptodate flag set on it.
+ */
+
+ if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
+ return lastoff;
+
+ lastoff = offset;
+ } while ((bh = bh->b_this_page) != head);
+ return -ENOENT;
+}
+
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ *
+ * Within unwritten extents, the page cache determines which parts are holes
+ * and which are data: unwritten and uptodate buffer heads count as data;
+ * everything else counts as a hole.
+ *
+ * Returns the resulting offset on success, and -ENOENT otherwise.
+ */
+loff_t
+page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
+ int whence)
+{
+ pgoff_t index = offset >> PAGE_SHIFT;
+ pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+ loff_t lastoff = offset;
+ struct pagevec pvec;
+
+ if (length <= 0)
+ return -ENOENT;
+
+ pagevec_init(&pvec, 0);
+
+ do {
+ unsigned want, nr_pages, i;
+
+ want = min_t(unsigned, end - index, PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ *
+ * If current page offset is beyond where we've ended,
+ * we've found a hole.
+ */
+ if (whence == SEEK_HOLE &&
+ lastoff < page_offset(page))
+ goto check_range;
+
+ /* Searching done if the page index is out of range. */
+ if (page->index >= end)
+ goto not_found;
+
+ lock_page(page);
+ if (likely(page->mapping == inode->i_mapping) &&
+ page_has_buffers(page)) {
+ lastoff = page_seek_hole_data(page, lastoff, whence);
+ if (lastoff >= 0) {
+ unlock_page(page);
+ goto check_range;
+ }
+ }
+ unlock_page(page);
+ lastoff = page_offset(page) + PAGE_SIZE;
+ }
+
+ /* Searching done if fewer pages returned than wanted. */
+ if (nr_pages < want)
+ break;
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index < end);
+
+ /* When there is no page at lastoff and we are not done, we found a hole. */
+ if (whence != SEEK_HOLE)
+ goto not_found;
+
+check_range:
+ if (lastoff < offset + length)
+ goto out;
+not_found:
+ lastoff = -ENOENT;
+out:
+ pagevec_release(&pvec);
+ return lastoff;
+}
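/*
 * Behaviour sketch for page_cache_seek_hole_data(), assuming 4K pages
 * and an unwritten extent at [0, 16K) whose only cached page, with an
 * uptodate buffer, sits at offset 4K:
 *
 *      page_cache_seek_hole_data(inode,  0, 16K, SEEK_DATA) ==  4K
 *      page_cache_seek_hole_data(inode, 4K, 12K, SEEK_HOLE) ==  8K
 *      page_cache_seek_hole_data(inode, 8K,  8K, SEEK_DATA) == -ENOENT
 *
 * A SEEK_DATA miss returns -ENOENT so the caller can move on to the
 * next extent; SEEK_HOLE finds an offset within the range unless the
 * whole range is cached data.
 */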
+
void __init buffer_init(void)
{
unsigned long nrpages;
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 9bf90bcc56ac..bb3a02ca9da4 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -18,7 +18,7 @@
#include <linux/fscache-cache.h>
#include <linux/timer.h>
-#include <linux/wait.h>
+#include <linux/wait_bit.h>
#include <linux/cred.h>
#include <linux/workqueue.h>
#include <linux/security.h>
@@ -97,7 +97,7 @@ struct cachefiles_cache {
* backing file read tracking
*/
struct cachefiles_one_read {
- wait_queue_t monitor; /* link into monitored waitqueue */
+ wait_queue_entry_t monitor; /* link into monitored waitqueue */
struct page *back_page; /* backing file page we're waiting for */
struct page *netfs_page; /* netfs page we're going to fill */
struct fscache_retrieval *op; /* retrieval op covering this */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 41df8a27d7eb..3978b324cbca 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -204,7 +204,7 @@ wait_for_old_object:
wait_queue_head_t *wq;
signed long timeout = 60 * HZ;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
bool requeue;
/* if the object we're waiting for is queued for processing,
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index afbdc418966d..18d7aa61ef0f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -21,7 +21,7 @@
* - we use this to detect read completion of backing pages
* - the caller holds the waitqueue lock
*/
-static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
+static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
int sync, void *_key)
{
struct cachefiles_one_read *monitor =
@@ -48,7 +48,7 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
}
/* remove from the waitqueue */
- list_del(&wait->task_list);
+ list_del(&wait->entry);
/* move onto the action list and queue for FS-Cache thread pool */
ASSERT(monitor->op);
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 987044bca1c2..59cb307b15fb 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -131,6 +131,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
}
if (new_mode != old_mode) {
+ newattrs.ia_ctime = current_time(inode);
newattrs.ia_mode = new_mode;
newattrs.ia_valid = ATTR_MODE;
ret = __ceph_setattr(inode, &newattrs);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9ecb2fd348cb..1e71e6ca5ddf 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -670,8 +670,12 @@ static void writepages_finish(struct ceph_osd_request *req)
bool remove_page;
dout("writepages_finish %p rc %d\n", inode, rc);
- if (rc < 0)
+ if (rc < 0) {
mapping_set_error(mapping, rc);
+ ceph_set_error_write(ci);
+ } else {
+ ceph_clear_error_write(ci);
+ }
/*
* We lost the cache cap, need to truncate the page before
@@ -703,9 +707,6 @@ static void writepages_finish(struct ceph_osd_request *req)
clear_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
- if (rc < 0)
- SetPageError(page);
-
ceph_put_snap_context(page_snap_context(page));
page->private = 0;
ClearPagePrivate(page);
@@ -1892,6 +1893,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
wr_req->r_mtime = ci->vfs_inode.i_mtime;
+ wr_req->r_abort_on_full = true;
err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
if (!err)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 68c78be19d5b..a3ebb632294e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1015,6 +1015,7 @@ static int send_cap_msg(struct cap_msg_args *arg)
void *p;
size_t extra_len;
struct timespec zerotime = {0};
+ struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
" seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
@@ -1076,8 +1077,12 @@ static int send_cap_msg(struct cap_msg_args *arg)
ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
/* inline data size */
ceph_encode_32(&p, 0);
- /* osd_epoch_barrier (version 5) */
- ceph_encode_32(&p, 0);
+ /*
+ * osd_epoch_barrier (version 5)
+ * The epoch_barrier is protected by osdc->lock, so READ_ONCE here in
+ * case it was recently changed.
+ */
+ ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
/* oldest_flush_tid (version 6) */
ceph_encode_64(&p, arg->oldest_flush_tid);
@@ -1389,7 +1394,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
first_tid = cf->tid + 1;
capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
- atomic_inc(&capsnap->nref);
+ refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
dout("__flush_snaps %p capsnap %p tid %llu %s\n",
@@ -2202,7 +2207,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
inode, capsnap, cf->tid,
ceph_cap_string(capsnap->dirty));
- atomic_inc(&capsnap->nref);
+ refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
@@ -3633,13 +3638,19 @@ void ceph_handle_caps(struct ceph_mds_session *session,
p += inline_len;
}
+ if (le16_to_cpu(msg->hdr.version) >= 5) {
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
+ u32 epoch_barrier;
+
+ ceph_decode_32_safe(&p, end, epoch_barrier, bad);
+ ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
+ }
+
if (le16_to_cpu(msg->hdr.version) >= 8) {
u64 flush_tid;
u32 caller_uid, caller_gid;
- u32 osd_epoch_barrier;
u32 pool_ns_len;
- /* version >= 5 */
- ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
+
/* version >= 6 */
ceph_decode_64_safe(&p, end, flush_tid, bad);
/* version >= 7 */
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3ef11bc8d728..4e2d112c982f 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -22,20 +22,19 @@ static int mdsmap_show(struct seq_file *s, void *p)
{
int i;
struct ceph_fs_client *fsc = s->private;
+ struct ceph_mdsmap *mdsmap;
if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
return 0;
- seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
- seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
- seq_printf(s, "session_timeout %d\n",
- fsc->mdsc->mdsmap->m_session_timeout);
- seq_printf(s, "session_autoclose %d\n",
- fsc->mdsc->mdsmap->m_session_autoclose);
- for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
- struct ceph_entity_addr *addr =
- &fsc->mdsc->mdsmap->m_info[i].addr;
- int state = fsc->mdsc->mdsmap->m_info[i].state;
-
+ mdsmap = fsc->mdsc->mdsmap;
+ seq_printf(s, "epoch %d\n", mdsmap->m_epoch);
+ seq_printf(s, "root %d\n", mdsmap->m_root);
+ seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds);
+ seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout);
+ seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose);
+ for (i = 0; i < mdsmap->m_num_mds; i++) {
+ struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
+ int state = mdsmap->m_info[i].state;
seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
ceph_pr_addr(&addr->in_addr),
ceph_mds_state_name(state));
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3e9ad501addf..e071d23f6148 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -294,7 +294,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
struct ceph_mds_client *mdsc = fsc->mdsc;
int i;
int err;
- u32 ftype;
+ unsigned frag = -1;
struct ceph_mds_reply_info_parsed *rinfo;
dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
@@ -341,7 +341,6 @@ more:
/* do we have the correct frag content buffered? */
if (need_send_readdir(fi, ctx->pos)) {
struct ceph_mds_request *req;
- unsigned frag;
int op = ceph_snap(inode) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
@@ -352,8 +351,11 @@ more:
}
if (is_hash_order(ctx->pos)) {
- frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
- NULL, NULL);
+ /* The fragtree isn't always accurate; choose the frag
+ * based on the previous reply when possible. */
+ if (frag == (unsigned)-1)
+ frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
+ NULL, NULL);
} else {
frag = fpos_frag(ctx->pos);
}
@@ -378,7 +380,11 @@ more:
ceph_mdsc_put_request(req);
return -ENOMEM;
}
+ } else if (is_hash_order(ctx->pos)) {
+ req->r_args.readdir.offset_hash =
+ cpu_to_le32(fpos_hash(ctx->pos));
}
+
req->r_dir_release_cnt = fi->dir_release_count;
req->r_dir_ordered_cnt = fi->dir_ordered_count;
req->r_readdir_cache_idx = fi->readdir_cache_idx;
@@ -476,6 +482,7 @@ more:
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino;
ino_t ino;
+ u32 ftype;
BUG_ON(rde->offset < ctx->pos);
@@ -498,15 +505,17 @@ more:
ctx->pos++;
}
+ ceph_mdsc_put_request(fi->last_readdir);
+ fi->last_readdir = NULL;
+
if (fi->next_offset > 2) {
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
+ frag = fi->frag;
goto more;
}
/* more frags? */
if (!ceph_frag_is_rightmost(fi->frag)) {
- unsigned frag = ceph_frag_next(fi->frag);
+ frag = ceph_frag_next(fi->frag);
if (is_hash_order(ctx->pos)) {
loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
fi->next_offset, true);
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e8f11fa565c5..7df550c13d7f 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -91,6 +91,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
ceph_mdsc_put_request(req);
if (!inode)
return ERR_PTR(-ESTALE);
+ if (inode->i_nlink == 0) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
}
return d_obtain_alias(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 26cc95421cca..29308a80d66f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -13,6 +13,38 @@
#include "mds_client.h"
#include "cache.h"
+static __le32 ceph_flags_sys2wire(u32 flags)
+{
+ u32 wire_flags = 0;
+
+ switch (flags & O_ACCMODE) {
+ case O_RDONLY:
+ wire_flags |= CEPH_O_RDONLY;
+ break;
+ case O_WRONLY:
+ wire_flags |= CEPH_O_WRONLY;
+ break;
+ case O_RDWR:
+ wire_flags |= CEPH_O_RDWR;
+ break;
+ }
+
+#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }
+
+ ceph_sys2wire(O_CREAT);
+ ceph_sys2wire(O_EXCL);
+ ceph_sys2wire(O_TRUNC);
+ ceph_sys2wire(O_DIRECTORY);
+ ceph_sys2wire(O_NOFOLLOW);
+
+#undef ceph_sys2wire
+
+ if (flags)
+ dout("unused open flags: %x", flags);
+
+ return cpu_to_le32(wire_flags);
+}
+
/*
* Ceph file operations
*
@@ -74,12 +106,9 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
(PAGE_SIZE - 1);
npages = calc_pages_for(align, nbytes);
- pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
- if (!pages) {
- pages = vmalloc(sizeof(*pages) * npages);
- if (!pages)
- return ERR_PTR(-ENOMEM);
- }
+ pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
for (idx = 0; idx < npages; ) {
size_t start;
@@ -123,7 +152,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
if (IS_ERR(req))
goto out;
req->r_fmode = ceph_flags_to_mode(flags);
- req->r_args.open.flags = cpu_to_le32(flags);
+ req->r_args.open.flags = ceph_flags_sys2wire(flags);
req->r_args.open.mode = cpu_to_le32(create_mode);
out:
return req;
@@ -192,7 +221,7 @@ int ceph_renew_caps(struct inode *inode)
spin_lock(&ci->i_ceph_lock);
wanted = __ceph_caps_file_wanted(ci);
if (__ceph_is_any_real_caps(ci) &&
- (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) {
+ (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
int issued = __ceph_caps_issued(ci, NULL);
spin_unlock(&ci->i_ceph_lock);
dout("renew caps %p want %s issued %s updating mds_wanted\n",
@@ -781,6 +810,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_callback = ceph_aio_complete_req;
req->r_inode = inode;
req->r_priv = aio_req;
+ req->r_abort_on_full = true;
ret = ceph_osdc_start_request(req->r_osdc, req, false);
out:
@@ -1088,19 +1118,22 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
out:
ceph_osdc_put_request(req);
- if (ret == 0) {
- pos += len;
- written += len;
-
- if (pos > i_size_read(inode)) {
- check_caps = ceph_inode_set_size(inode, pos);
- if (check_caps)
- ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY,
- NULL);
- }
- } else
+ if (ret != 0) {
+ ceph_set_error_write(ci);
break;
+ }
+
+ ceph_clear_error_write(ci);
+ pos += len;
+ written += len;
+ if (pos > i_size_read(inode)) {
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+
}
if (ret != -EOLDSNAPC && written > 0) {
@@ -1306,6 +1339,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
retry_snap:
+ /* FIXME: not complete since it doesn't account for being at quota */
if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) {
err = -ENOSPC;
goto out;
@@ -1327,7 +1361,8 @@ retry_snap:
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+ (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) ||
+ (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
struct ceph_snap_context *snapc;
struct iov_iter data;
inode_unlock(inode);
@@ -1636,8 +1671,12 @@ static long ceph_fallocate(struct file *file, int mode,
}
size = i_size_read(inode);
- if (!(mode & FALLOC_FL_KEEP_SIZE))
+ if (!(mode & FALLOC_FL_KEEP_SIZE)) {
endoff = offset + length;
+ ret = inode_newsize_ok(inode, endoff);
+ if (ret)
+ goto unlock;
+ }
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
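
ceph_flags_sys2wire() stops passing raw O_* values to the MDS and translates them into an explicit wire encoding, since the numeric values of open flags differ between architectures. A hedged userspace sketch of the same macro trick; the CEPH_O_* values below are illustrative, not the protocol's:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative wire values; the real CEPH_O_* constants live in the
 * ceph protocol headers. */
#define CEPH_O_RDONLY	0x0
#define CEPH_O_WRONLY	0x1
#define CEPH_O_RDWR	0x2
#define CEPH_O_CREAT	0x100
#define CEPH_O_EXCL	0x200
#define CEPH_O_TRUNC	0x1000

static uint32_t flags_sys2wire(uint32_t flags)
{
	uint32_t wire = 0;

	switch (flags & O_ACCMODE) {
	case O_RDONLY:	wire |= CEPH_O_RDONLY;	break;
	case O_WRONLY:	wire |= CEPH_O_WRONLY;	break;
	case O_RDWR:	wire |= CEPH_O_RDWR;	break;
	}
	flags &= ~O_ACCMODE;

/* Map one host flag to its wire bit and strip it from 'flags', so
 * anything left over is visibly unhandled. */
#define SYS2WIRE(f) do { if (flags & f) { wire |= CEPH_##f; flags &= ~f; } } while (0)
	SYS2WIRE(O_CREAT);
	SYS2WIRE(O_EXCL);
	SYS2WIRE(O_TRUNC);
#undef SYS2WIRE

	if (flags)
		fprintf(stderr, "unused open flags: %#x\n", flags);
	return wire;
}

int main(void)
{
	printf("wire=%#x\n", flags_sys2wire(O_WRONLY | O_CREAT | O_TRUNC));
	return 0;
}

The strip-as-you-go pattern is the point of the `flags &= ~a` in the kernel macro: at the end, any nonzero remainder is a flag the translation does not handle, which the dout() call surfaces.
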
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d3119fe3ab45..4de6cdddf059 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1482,10 +1482,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
return readdir_prepopulate_inodes_only(req, session);
- if (rinfo->hash_order && req->r_path2) {
- last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
- req->r_path2, strlen(req->r_path2));
- last_hash = ceph_frag_value(last_hash);
+ if (rinfo->hash_order) {
+ if (req->r_path2) {
+ last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+ req->r_path2,
+ strlen(req->r_path2));
+ last_hash = ceph_frag_value(last_hash);
+ } else if (rinfo->offset_hash) {
+ /* mds understands offset_hash */
+ WARN_ON_ONCE(req->r_readdir_offset != 2);
+ last_hash = le32_to_cpu(rhead->args.readdir.offset_hash);
+ }
}
if (rinfo->dir_dir &&
@@ -1510,7 +1517,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
}
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 &&
- !(rinfo->hash_order && req->r_path2)) {
+ !(rinfo->hash_order && last_hash)) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
@@ -2015,7 +2022,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
attr->ia_size > inode->i_size) {
i_size_write(inode, attr->ia_size);
inode->i_blocks = calc_inode_blocks(attr->ia_size);
- inode->i_ctime = attr->ia_ctime;
ci->i_reported_size = attr->ia_size;
dirtied |= CEPH_CAP_FILE_EXCL;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
@@ -2037,7 +2043,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
only ? "ctime only" : "ignored");
- inode->i_ctime = attr->ia_ctime;
if (only) {
/*
* if kernel wants to dirty ctime but nothing else,
@@ -2060,7 +2065,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
- inode->i_ctime = current_time(inode);
+ inode->i_ctime = attr->ia_ctime;
}
release &= issued;
@@ -2078,6 +2083,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
req->r_inode_drop = release;
req->r_args.setattr.mask = cpu_to_le32(mask);
req->r_num_caps = 1;
+ req->r_stamp = attr->ia_ctime;
err = ceph_mdsc_do_request(mdsc, NULL, req);
}
dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c681762d76e6..0c05df44cc6c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -189,6 +189,7 @@ static int parse_reply_info_dir(void **p, void *end,
info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
+ info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
}
if (num == 0)
goto done;
@@ -378,9 +379,9 @@ const char *ceph_session_state_name(int s)
static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
{
- if (atomic_inc_not_zero(&s->s_ref)) {
+ if (refcount_inc_not_zero(&s->s_ref)) {
dout("mdsc get_session %p %d -> %d\n", s,
- atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
+ refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
return s;
} else {
dout("mdsc get_session %p 0 -- FAIL", s);
@@ -391,8 +392,8 @@ static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
void ceph_put_mds_session(struct ceph_mds_session *s)
{
dout("mdsc put_session %p %d -> %d\n", s,
- atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
- if (atomic_dec_and_test(&s->s_ref)) {
+ refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
+ if (refcount_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer)
ceph_auth_destroy_authorizer(s->s_auth.authorizer);
kfree(s);
@@ -411,7 +412,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
return NULL;
session = mdsc->sessions[mds];
dout("lookup_mds_session %p %d\n", session,
- atomic_read(&session->s_ref));
+ refcount_read(&session->s_ref));
get_session(session);
return session;
}
@@ -441,7 +442,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
{
struct ceph_mds_session *s;
- if (mds >= mdsc->mdsmap->m_max_mds)
+ if (mds >= mdsc->mdsmap->m_num_mds)
return ERR_PTR(-EINVAL);
s = kzalloc(sizeof(*s), GFP_NOFS);
@@ -466,7 +467,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
INIT_LIST_HEAD(&s->s_caps);
s->s_nr_caps = 0;
s->s_trim_caps = 0;
- atomic_set(&s->s_ref, 1);
+ refcount_set(&s->s_ref, 1);
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
s->s_num_cap_releases = 0;
@@ -494,7 +495,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
}
mdsc->sessions[mds] = s;
atomic_inc(&mdsc->num_sessions);
- atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
+ refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */
ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
@@ -1004,7 +1005,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *ts;
int i, mds = session->s_mds;
- if (mds >= mdsc->mdsmap->m_max_mds)
+ if (mds >= mdsc->mdsmap->m_num_mds)
return;
mi = &mdsc->mdsmap->m_info[mds];
@@ -1551,9 +1552,15 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head;
struct ceph_mds_cap_item *item;
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
struct ceph_cap *cap;
LIST_HEAD(tmp_list);
int num_cap_releases;
+ __le32 barrier, *cap_barrier;
+
+ down_read(&osdc->lock);
+ barrier = cpu_to_le32(osdc->epoch_barrier);
+ up_read(&osdc->lock);
spin_lock(&session->s_cap_lock);
again:
@@ -1571,7 +1578,11 @@ again:
head = msg->front.iov_base;
head->num = cpu_to_le32(0);
msg->front.iov_len = sizeof(*head);
+
+ msg->hdr.version = cpu_to_le16(2);
+ msg->hdr.compat_version = cpu_to_le16(1);
}
+
cap = list_first_entry(&tmp_list, struct ceph_cap,
session_caps);
list_del(&cap->session_caps);
@@ -1589,6 +1600,11 @@ again:
ceph_put_cap(mdsc, cap);
if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+ // Append cap_barrier field
+ cap_barrier = msg->front.iov_base + msg->front.iov_len;
+ *cap_barrier = barrier;
+ msg->front.iov_len += sizeof(*cap_barrier);
+
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
@@ -1604,6 +1620,11 @@ again:
spin_unlock(&session->s_cap_lock);
if (msg) {
+ // Append cap_barrier field
+ cap_barrier = msg->front.iov_base + msg->front.iov_len;
+ *cap_barrier = barrier;
+ msg->front.iov_len += sizeof(*cap_barrier);
+
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
@@ -1684,7 +1705,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
- req->r_stamp = current_fs_time(mdsc->fsc->sb);
+ req->r_stamp = timespec_trunc(current_kernel_time(), mdsc->fsc->sb->s_time_gran);
req->r_op = op;
req->r_direct_mode = mode;
@@ -1991,7 +2012,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_pagelist) {
struct ceph_pagelist *pagelist = req->r_pagelist;
- atomic_inc(&pagelist->refcnt);
+ refcount_inc(&pagelist->refcnt);
ceph_msg_data_add_pagelist(msg, pagelist);
msg->hdr.data_len = cpu_to_le32(pagelist->length);
} else {
@@ -2638,8 +2659,10 @@ static void handle_session(struct ceph_mds_session *session,
seq = le64_to_cpu(h->seq);
mutex_lock(&mdsc->mutex);
- if (op == CEPH_SESSION_CLOSE)
+ if (op == CEPH_SESSION_CLOSE) {
+ get_session(session);
__unregister_session(mdsc, session);
+ }
/* FIXME: this ttl calculation is generous */
session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
mutex_unlock(&mdsc->mutex);
@@ -2728,6 +2751,8 @@ static void handle_session(struct ceph_mds_session *session,
kick_requests(mdsc, mds);
mutex_unlock(&mdsc->mutex);
}
+ if (op == CEPH_SESSION_CLOSE)
+ ceph_put_mds_session(session);
return;
bad:
@@ -3107,7 +3132,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
dout("check_new_map new %u old %u\n",
newmap->m_epoch, oldmap->m_epoch);
- for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
+ for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) {
if (mdsc->sessions[i] == NULL)
continue;
s = mdsc->sessions[i];
@@ -3121,15 +3146,33 @@ static void check_new_map(struct ceph_mds_client *mdsc,
ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
ceph_session_state_name(s->s_state));
- if (i >= newmap->m_max_mds ||
+ if (i >= newmap->m_num_mds ||
memcmp(ceph_mdsmap_get_addr(oldmap, i),
ceph_mdsmap_get_addr(newmap, i),
sizeof(struct ceph_entity_addr))) {
if (s->s_state == CEPH_MDS_SESSION_OPENING) {
/* the session never opened, just close it
* out now */
+ get_session(s);
+ __unregister_session(mdsc, s);
__wake_requests(mdsc, &s->s_waiting);
+ ceph_put_mds_session(s);
+ } else if (i >= newmap->m_num_mds) {
+ /* force close session for stopped mds */
+ get_session(s);
__unregister_session(mdsc, s);
+ __wake_requests(mdsc, &s->s_waiting);
+ kick_requests(mdsc, i);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ cleanup_session_requests(mdsc, s);
+ remove_session_caps(s);
+ mutex_unlock(&s->s_mutex);
+
+ ceph_put_mds_session(s);
+
+ mutex_lock(&mdsc->mutex);
} else {
/* just close it */
mutex_unlock(&mdsc->mutex);
@@ -3167,7 +3210,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
}
}
- for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
+ for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) {
s = mdsc->sessions[i];
if (!s)
continue;
@@ -3881,7 +3924,7 @@ static struct ceph_connection *con_get(struct ceph_connection *con)
struct ceph_mds_session *s = con->private;
if (get_session(s)) {
- dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
+ dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref));
return con;
}
dout("mdsc con_get %p FAIL\n", s);
@@ -3892,7 +3935,7 @@ static void con_put(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
- dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
+ dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1);
ceph_put_mds_session(s);
}
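
The cap-release change bumps the message header to version 2 and appends a trailing __le32 OSD epoch barrier after the cap items, which is why CEPH_CAPS_PER_RELEASE in mds_client.h now reserves sizeof(u32) of the page. A userspace sketch of the append-and-fixup pattern; the struct names are stand-ins for the wire structures:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct cap_release_head { uint32_t num; } __attribute__((packed));
struct cap_item { uint64_t ino; uint32_t seq; } __attribute__((packed));

int main(void)
{
	unsigned char buf[256];
	size_t len = 0;
	uint32_t barrier = 1234;	/* snapshot of osdc->epoch_barrier */
	struct cap_release_head head = { .num = 2 };
	struct cap_item items[2] = { { 1, 7 }, { 2, 9 } };

	memcpy(buf, &head, sizeof(head));		len += sizeof(head);
	memcpy(buf + len, items, sizeof(items));	len += sizeof(items);

	/* Trailing field: append the barrier last, then fix up the
	 * front length, mirroring msg->front.iov_len in the patch. */
	memcpy(buf + len, &barrier, sizeof(barrier));
	len += sizeof(barrier);

	printf("message front length: %zu bytes\n", len);
	return 0;
}

Sampling the barrier once up front, under osdc->lock, keeps every message built in the loop consistent even if the epoch barrier moves while releases are being batched.
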
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ac0475a2daa7..db57ae98ed34 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -7,6 +7,7 @@
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
+#include <linux/refcount.h>
#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
@@ -82,9 +83,10 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_dirfrag *dir_dir;
size_t dir_buf_size;
int dir_nr;
- bool dir_complete;
bool dir_end;
+ bool dir_complete;
bool hash_order;
+ bool offset_hash;
struct ceph_mds_reply_dir_entry *dir_entries;
};
@@ -104,10 +106,13 @@ struct ceph_mds_reply_info_parsed {
/*
* cap releases are batched and sent to the MDS en masse.
+ *
+ * Account for per-message overhead of mds_cap_release header
+ * and __le32 for osd epoch barrier trailing field.
*/
-#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - \
+#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \
sizeof(struct ceph_mds_cap_release)) / \
- sizeof(struct ceph_mds_cap_item))
+ sizeof(struct ceph_mds_cap_item))
/*
@@ -156,7 +161,7 @@ struct ceph_mds_session {
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
- atomic_t s_ref;
+ refcount_t s_ref;
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
};
@@ -373,7 +378,7 @@ __ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
static inline struct ceph_mds_session *
ceph_get_mds_session(struct ceph_mds_session *s)
{
- atomic_inc(&s->s_ref);
+ refcount_inc(&s->s_ref);
return s;
}
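
The atomic_t to refcount_t conversion in these files is not just renaming: refcount_t saturates instead of wrapping on overflow and refuses to increment from zero, turning use-after-free refcount bugs into warnings instead of silent corruption. A rough userspace model of refcount_inc_not_zero() using C11 atomics; the kernel implementation differs, this only illustrates the semantics that get_session() relies on:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only if at least one is already held; a caller
 * seeing the count at 0 must treat the object as already dying. */
static bool refcount_inc_not_zero(atomic_int *r)
{
	int old = atomic_load(r);

	while (old != 0) {
		if (atomic_compare_exchange_weak(r, &old, old + 1))
			return true;
		/* 'old' was reloaded by the failed CAS; retry. */
	}
	return false;
}

int main(void)
{
	atomic_int live = 1, dead = 0;

	printf("live: %d\n", refcount_inc_not_zero(&live));	/* 1 */
	printf("dead: %d\n", refcount_inc_not_zero(&dead));	/* 0 */
	return 0;
}
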
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 5454e2327a5f..1a748cf88535 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -22,11 +22,11 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
int i;
/* special case for one mds */
- if (1 == m->m_max_mds && m->m_info[0].state > 0)
+ if (1 == m->m_num_mds && m->m_info[0].state > 0)
return 0;
/* count */
- for (i = 0; i < m->m_max_mds; i++)
+ for (i = 0; i < m->m_num_mds; i++)
if (m->m_info[i].state > 0)
n++;
if (n == 0)
@@ -135,8 +135,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_session_autoclose = ceph_decode_32(p);
m->m_max_file_size = ceph_decode_64(p);
m->m_max_mds = ceph_decode_32(p);
+ m->m_num_mds = m->m_max_mds;
- m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
+ m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
if (m->m_info == NULL)
goto nomem;
@@ -207,9 +208,20 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
ceph_pr_addr(&addr.in_addr),
ceph_mds_state_name(state));
- if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+ if (mds < 0 || state <= 0)
continue;
+ if (mds >= m->m_num_mds) {
+ int new_num = max(mds + 1, m->m_num_mds * 2);
+ void *new_m_info = krealloc(m->m_info,
+ new_num * sizeof(*m->m_info),
+ GFP_NOFS | __GFP_ZERO);
+ if (!new_m_info)
+ goto nomem;
+ m->m_info = new_m_info;
+ m->m_num_mds = new_num;
+ }
+
info = &m->m_info[mds];
info->global_id = global_id;
info->state = state;
@@ -229,6 +241,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
info->export_targets = NULL;
}
}
+ if (m->m_num_mds > m->m_max_mds) {
+ /* find max up mds */
+ for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
+ if (i == 0 || m->m_info[i-1].state > 0)
+ break;
+ }
+ m->m_num_mds = i;
+ }
/* pg_pools */
ceph_decode_32_safe(p, end, n, bad);
@@ -270,12 +290,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
for (i = 0; i < n; i++) {
s32 mds = ceph_decode_32(p);
- if (mds >= 0 && mds < m->m_max_mds) {
+ if (mds >= 0 && mds < m->m_num_mds) {
if (m->m_info[mds].laggy)
num_laggy++;
}
}
m->m_num_laggy = num_laggy;
+
+ if (n > m->m_num_mds) {
+ void *new_m_info = krealloc(m->m_info,
+ n * sizeof(*m->m_info),
+ GFP_NOFS | __GFP_ZERO);
+ if (!new_m_info)
+ goto nomem;
+ m->m_info = new_m_info;
+ }
+ m->m_num_mds = n;
}
/* inc */
@@ -341,7 +371,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
{
int i;
- for (i = 0; i < m->m_max_mds; i++)
+ for (i = 0; i < m->m_num_mds; i++)
kfree(m->m_info[i].export_targets);
kfree(m->m_info);
kfree(m->m_data_pg_pools);
@@ -357,7 +387,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
return false;
if (m->m_num_laggy > 0)
return false;
- for (i = 0; i < m->m_max_mds; i++) {
+ for (i = 0; i < m->m_num_mds; i++) {
if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
nr_active++;
}
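
mdsmap decoding previously dropped any MDS whose rank fell outside m_max_mds; since ranks can now exceed that cluster setting, the info table grows on demand, and the doubling policy max(mds + 1, m_num_mds * 2) keeps the number of reallocations logarithmic in the highest rank seen. A userspace analogue of the grow step, with an explicit memset of the new tail standing in for the __GFP_ZERO intent:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct mds_info { int state; };

static int grow_info(struct mds_info **info, int *num, int mds)
{
	int new_num = mds + 1 > *num * 2 ? mds + 1 : *num * 2;
	struct mds_info *p = realloc(*info, new_num * sizeof(*p));

	if (!p)
		return -1;
	/* Zero the newly exposed slots so unseen ranks read as state 0. */
	memset(p + *num, 0, (new_num - *num) * sizeof(*p));
	*info = p;
	*num = new_num;
	return 0;
}

int main(void)
{
	struct mds_info *info = calloc(2, sizeof(*info));
	int num = 2;

	if (!info || grow_info(&info, &num, 5))	/* rank 5 arrives */
		return 1;
	printf("table now holds %d ranks\n", num);
	free(info);
	return 0;
}

The trailing trim loop in the patch then walks back from m_num_mds to drop ranks beyond the last one actually up, so the table does not stay inflated after a shrink.
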
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 8f8b41c2ef0f..dab5d6732345 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -519,7 +519,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
capsnap->need_flush ? "" : "no_flush");
ihold(inode);
- atomic_set(&capsnap->nref, 1);
+ refcount_set(&capsnap->nref, 1);
INIT_LIST_HEAD(&capsnap->ci_item);
capsnap->follows = old_snapc->seq;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a8c81b2052ca..8d7918ce694a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -544,10 +544,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
{
struct ceph_fs_client *fsc;
- const u64 supported_features =
- CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
- CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
- const u64 required_features = 0;
int page_count;
size_t size;
int err = -ENOMEM;
@@ -556,8 +552,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc)
return ERR_PTR(-ENOMEM);
- fsc->client = ceph_create_client(opt, fsc, supported_features,
- required_features);
+ fsc->client = ceph_create_client(opt, fsc);
if (IS_ERR(fsc->client)) {
err = PTR_ERR(fsc->client);
goto fail;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 176186b12457..a973acd8beaf 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -14,6 +14,7 @@
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/posix_acl.h>
+#include <linux/refcount.h>
#include <linux/ceph/libceph.h>
@@ -160,7 +161,7 @@ struct ceph_cap_flush {
* data before flushing the snapped state (tracked here) back to the MDS.
*/
struct ceph_cap_snap {
- atomic_t nref;
+ refcount_t nref;
struct list_head ci_item;
struct ceph_cap_flush cap_flush;
@@ -189,7 +190,7 @@ struct ceph_cap_snap {
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
{
- if (atomic_dec_and_test(&capsnap->nref)) {
+ if (refcount_dec_and_test(&capsnap->nref)) {
if (capsnap->xattr_blob)
ceph_buffer_put(capsnap->xattr_blob);
kfree(capsnap);
@@ -471,6 +472,32 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */
#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snaps */
+#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */
+
+/*
+ * We set the ERROR_WRITE bit when we start seeing write errors on an inode
+ * and then clear it when they start succeeding. Note that we do a lockless
+ * check first, and only take the lock if it looks like it needs to be changed.
+ * The write submission code just takes this as a hint, so we're not too
+ * worried if a few slip through in either direction.
+ */
+static inline void ceph_set_error_write(struct ceph_inode_info *ci)
+{
+ if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)) {
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags |= CEPH_I_ERROR_WRITE;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
+static inline void ceph_clear_error_write(struct ceph_inode_info *ci)
+{
+ if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE) {
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags &= ~CEPH_I_ERROR_WRITE;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count,
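
The ERROR_WRITE helpers use exactly the pattern the new comment in super.h describes: read the flag word without the lock, and take i_ceph_lock only when the bit actually needs to change; since the flag is a hint to the write path, a racing reader seeing a slightly stale value is acceptable. A generic C11 sketch of the same check-then-lock pattern; the names are illustrative, not ceph's:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define ERROR_WRITE (1u << 11)

struct obj {
	pthread_mutex_t lock;
	atomic_uint flags;	/* plays the role of ci->i_ceph_flags */
};

static void set_error_write(struct obj *o)
{
	/* Cheap lockless test first; take the lock only to mutate. */
	if (!(atomic_load(&o->flags) & ERROR_WRITE)) {
		pthread_mutex_lock(&o->lock);
		atomic_fetch_or(&o->flags, ERROR_WRITE);
		pthread_mutex_unlock(&o->lock);
	}
}

int main(void)
{
	struct obj o = { PTHREAD_MUTEX_INITIALIZER, 0 };

	set_error_write(&o);
	set_error_write(&o);	/* second call skips the lock entirely */
	printf("flags=%#x\n", atomic_load(&o.flags));
	return 0;
}
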
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index febc28f9e2c2..75267cdd5dfd 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -392,6 +392,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (update_xattr) {
int err = 0;
+
if (xattr && (flags & XATTR_CREATE))
err = -EEXIST;
else if (!xattr && (flags & XATTR_REPLACE))
@@ -399,12 +400,14 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (err) {
kfree(name);
kfree(val);
+ kfree(*newxattr);
return err;
}
if (update_xattr < 0) {
if (xattr)
__remove_xattr(ci, xattr);
kfree(name);
+ kfree(*newxattr);
return 0;
}
}
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 034f00f21390..afeefe79c25e 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -146,6 +146,15 @@ config CIFS_DEBUG2
option can be turned off unless you are debugging
cifs problems. If unsure, say N.
+config CIFS_DEBUG_DUMP_KEYS
+ bool "Dump encryption keys for offline decryption (Unsafe)"
+ depends on CIFS_DEBUG && CIFS_SMB2
+ help
+ Enabling this will dump the encryption and decryption keys
+ used to communicate on an encrypted share connection to the
+ console. This allows Wireshark to decrypt and dissect
+ encrypted network captures. Enable this carefully.
+
config CIFS_DFS_UPCALL
bool "DFS feature support"
depends on CIFS && KEYS
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index a0b3e7d1be48..e0445e2075b2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -79,6 +79,10 @@ convert_sfu_char(const __u16 src_char, char *target)
static bool
convert_sfm_char(const __u16 src_char, char *target)
{
+ if (src_char >= 0xF001 && src_char <= 0xF01F) {
+ *target = src_char - 0xF000;
+ return true;
+ }
switch (src_char) {
case SFM_COLON:
*target = ':';
@@ -417,6 +421,10 @@ static __le16 convert_to_sfm_char(char src_char, bool end_of_string)
{
__le16 dest_char;
+ if (src_char >= 0x01 && src_char <= 0x1F) {
+ dest_char = cpu_to_le16(src_char + 0xF000);
+ return dest_char;
+ }
switch (src_char) {
case ':':
dest_char = cpu_to_le16(SFM_COLON);
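
SMB servers reject the ASCII control range in names, so the SFM ("Services for Mac") convention remaps bytes 0x01-0x1F into the Unicode private-use points 0xF001-0xF01F; this patch adds the blanket range on top of the existing per-character cases such as ':' and '*'. A small round-trip sketch of just the range mapping:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool to_sfm(char src, uint16_t *dst)
{
	unsigned char c = (unsigned char)src;

	if (c >= 0x01 && c <= 0x1F) {	/* control char -> private use */
		*dst = 0xF000 + c;
		return true;
	}
	return false;	/* ':', '*', etc. are handled case by case */
}

static bool from_sfm(uint16_t src, char *dst)
{
	if (src >= 0xF001 && src <= 0xF01F) {
		*dst = (char)(src - 0xF000);
		return true;
	}
	return false;
}

int main(void)
{
	uint16_t wire;
	char back;

	to_sfm('\t', &wire);	/* 0x09 -> 0xF009 */
	from_sfm(wire, &back);
	printf("wire=%#x back=%#x\n", wire, back);
	return 0;
}
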
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 15bac390dff9..b98436f5c7c7 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1135,20 +1135,19 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
u32 acllen = 0;
int rc = 0;
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
- struct cifs_tcon *tcon;
+ struct smb_version_operations *ops;
cifs_dbg(NOISY, "converting ACL to mode for %s\n", path);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
- tcon = tlink_tcon(tlink);
- if (pfid && (tcon->ses->server->ops->get_acl_by_fid))
- pntsd = tcon->ses->server->ops->get_acl_by_fid(cifs_sb, pfid,
- &acllen);
- else if (tcon->ses->server->ops->get_acl)
- pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
- &acllen);
+ ops = tlink_tcon(tlink)->ses->server->ops;
+
+ if (pfid && (ops->get_acl_by_fid))
+ pntsd = ops->get_acl_by_fid(cifs_sb, pfid, &acllen);
+ else if (ops->get_acl)
+ pntsd = ops->get_acl(cifs_sb, inode, path, &acllen);
else {
cifs_put_tlink(tlink);
return -EOPNOTSUPP;
@@ -1181,23 +1180,23 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
- struct cifs_tcon *tcon;
+ struct smb_version_operations *ops;
if (IS_ERR(tlink))
return PTR_ERR(tlink);
- tcon = tlink_tcon(tlink);
+
+ ops = tlink_tcon(tlink)->ses->server->ops;
cifs_dbg(NOISY, "set ACL from mode for %s\n", path);
/* Get the security descriptor */
- if (tcon->ses->server->ops->get_acl == NULL) {
+ if (ops->get_acl == NULL) {
cifs_put_tlink(tlink);
return -EOPNOTSUPP;
}
- pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
- &secdesclen);
+ pntsd = ops->get_acl(cifs_sb, inode, path, &secdesclen);
if (IS_ERR(pntsd)) {
rc = PTR_ERR(pntsd);
cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
@@ -1224,13 +1223,12 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc);
- if (tcon->ses->server->ops->set_acl == NULL)
+ if (ops->set_acl == NULL)
rc = -EOPNOTSUPP;
if (!rc) {
/* Set the security descriptor */
- rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode,
- path, aclflag);
+ rc = ops->set_acl(pnntsd, secdesclen, inode, path, aclflag);
cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc);
}
cifs_put_tlink(tlink);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 058ac9b36f04..68abbb0db608 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -478,6 +478,7 @@ find_timestamp(struct cifs_ses *ses)
unsigned char *blobptr;
unsigned char *blobend;
struct ntlmssp2_name *attrptr;
+ struct timespec ts;
if (!ses->auth_key.len || !ses->auth_key.response)
return 0;
@@ -502,7 +503,8 @@ find_timestamp(struct cifs_ses *ses)
blobptr += attrsize; /* advance attr value */
}
- return cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ ktime_get_real_ts(&ts);
+ return cpu_to_le64(cifs_UnixTimeToNT(ts));
}
static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
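
find_timestamp() now reads the clock with ktime_get_real_ts() into a struct timespec instead of the removed CURRENT_TIME macro, and the value is still converted to NT time, which counts 100-nanosecond intervals since 1601-01-01. A standalone version of that conversion; the 11644473600-second offset is the standard Unix/NT epoch delta:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Seconds between 1601-01-01 and 1970-01-01. */
#define NT_EPOCH_OFFSET 11644473600ULL

static uint64_t unix_ts_to_nt(struct timespec ts)
{
	return ((uint64_t)ts.tv_sec + NT_EPOCH_OFFSET) * 10000000ULL
		+ ts.tv_nsec / 100;
}

int main(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_REALTIME, &ts);	/* ~ ktime_get_real_ts() */
	printf("NT time: %llu\n", (unsigned long long)unix_ts_to_nt(ts));
	return 0;
}
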
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 8be55be70faf..bcc7d9acad64 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -418,7 +418,7 @@ struct smb_version_operations {
int (*validate_negotiate)(const unsigned int, struct cifs_tcon *);
ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *,
const unsigned char *, const unsigned char *, char *,
- size_t, const struct nls_table *, int);
+ size_t, struct cifs_sb_info *);
int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *,
const char *, const void *, const __u16,
const struct nls_table *, int);
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e49958c3f8bb..6eb3147132e3 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -480,8 +480,7 @@ extern int CIFSSMBCopy(unsigned int xid,
extern ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon,
const unsigned char *searchName,
const unsigned char *ea_name, char *EAData,
- size_t bufsize, const struct nls_table *nls_codepage,
- int remap_special_chars);
+ size_t bufsize, struct cifs_sb_info *cifs_sb);
extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
const char *fileName, const char *ea_name,
const void *ea_value, const __u16 ea_value_len,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 205fd94f52fd..fbb0d4cbda41 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -478,14 +478,14 @@ decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
* this requirement.
*/
int val, seconds, remain, result;
- struct timespec ts, utc;
- utc = CURRENT_TIME;
+ struct timespec ts;
+ unsigned long utc = ktime_get_real_seconds();
ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
rsp->SrvTime.Time, 0);
cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
- (int)ts.tv_sec, (int)utc.tv_sec,
- (int)(utc.tv_sec - ts.tv_sec));
- val = (int)(utc.tv_sec - ts.tv_sec);
+ (int)ts.tv_sec, (int)utc,
+ (int)(utc - ts.tv_sec));
+ val = (int)(utc - ts.tv_sec);
seconds = abs(val);
result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
remain = seconds % MIN_TZ_ADJ;
@@ -697,9 +697,7 @@ cifs_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
- mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
- mutex_unlock(&server->srv_mutex);
add_credits(server, 1, CIFS_ECHO_OP);
}
@@ -1599,9 +1597,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
}
queue_work(cifsiod_wq, &rdata->work);
- mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
- mutex_unlock(&server->srv_mutex);
add_credits(server, 1, 0);
}
@@ -2058,7 +2054,6 @@ cifs_writev_callback(struct mid_q_entry *mid)
{
struct cifs_writedata *wdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
- struct TCP_Server_Info *server = tcon->ses->server;
unsigned int written;
WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
@@ -2095,9 +2090,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
}
queue_work(cifsiod_wq, &wdata->work);
- mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
- mutex_unlock(&server->srv_mutex);
add_credits(tcon->ses->server, 1, 0);
}
@@ -6076,11 +6069,13 @@ ssize_t
CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon,
const unsigned char *searchName, const unsigned char *ea_name,
char *EAData, size_t buf_size,
- const struct nls_table *nls_codepage, int remap)
+ struct cifs_sb_info *cifs_sb)
{
/* BB assumes one setup word */
TRANSACTION2_QPI_REQ *pSMB = NULL;
TRANSACTION2_QPI_RSP *pSMBr = NULL;
+ int remap = cifs_remap(cifs_sb);
+ struct nls_table *nls_codepage = cifs_sb->local_nls;
int rc = 0;
int bytes_returned;
int list_len;
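
The negprot hunk above makes the same CURRENT_TIME-to-ktime_get_real_seconds() move for the LANMAN timezone guess: the server's clock is differenced against the client's and the delta rounded to a MIN_TZ_ADJ boundary, since real timezones fall on such increments. A sketch of the rounding step, assuming the 15-minute MIN_TZ_ADJ used in fs/cifs and a round-half-up policy:

#include <stdio.h>
#include <stdlib.h>

#define MIN_TZ_ADJ (15 * 60)	/* 15 minutes, as in fs/cifs */

/* Round a client-vs-server clock delta to the nearest tz increment. */
static int round_tz(long val)
{
	long seconds = labs(val);
	long result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
	long remain = seconds % MIN_TZ_ADJ;

	if (remain >= MIN_TZ_ADJ / 2)
		result += MIN_TZ_ADJ;
	return (int)(val < 0 ? -result : result);
}

int main(void)
{
	printf("%d\n", round_tz(3540));		/* 59 min -> 3600  */
	printf("%d\n", round_tz(-7300));	/* ~ -2 h  -> -7200 */
	return 0;
}
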
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 6ef78ad838e6..bc09df6b473a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -582,7 +582,7 @@ cifs_relock_file(struct cifsFileInfo *cfile)
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
int rc = 0;
- down_read(&cinode->lock_sem);
+ down_read_nested(&cinode->lock_sem, SINGLE_DEPTH_NESTING);
if (cinode->can_cache_brlcks) {
/* can cache locks - no need to relock */
up_read(&cinode->lock_sem);
@@ -2234,14 +2234,16 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
set_page_writeback(page);
retry_write:
rc = cifs_partialpagewrite(page, 0, PAGE_SIZE);
- if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL)
- goto retry_write;
- else if (rc == -EAGAIN)
+ if (rc == -EAGAIN) {
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ goto retry_write;
redirty_page_for_writepage(wbc, page);
- else if (rc != 0)
+ } else if (rc != 0) {
SetPageError(page);
- else
+ mapping_set_error(page->mapping, rc);
+ } else {
SetPageUptodate(page);
+ }
end_page_writeback(page);
put_page(page);
free_xid(xid);
@@ -2810,12 +2812,12 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
ssize_t rc;
+ inode_lock(inode);
/*
* We need to hold the sem to be sure nobody modifies lock list
* with a brlock that prevents writing.
*/
down_read(&cinode->lock_sem);
- inode_lock(inode);
rc = generic_write_checks(iocb, from);
if (rc <= 0)
@@ -2828,11 +2830,11 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
else
rc = -EACCES;
out:
+ up_read(&cinode->lock_sem);
inode_unlock(inode);
if (rc > 0)
rc = generic_write_sync(iocb, rc);
- up_read(&cinode->lock_sem);
return rc;
}
@@ -3271,7 +3273,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
if (!is_sync_kiocb(iocb))
ctx->iocb = iocb;
- if (to->type & ITER_IOVEC)
+ if (to->type == ITER_IOVEC)
ctx->should_dirty = true;
rc = setup_aio_ctx_iter(ctx, to, READ);
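
The cifs_writev() hunk is a lock-ordering fix: inode_lock is now taken before lock_sem and released after it, so every path acquires the two in the same order. Two locks ever taken in opposite orders by two threads can deadlock; a compact pthread illustration of the rule, with the mutex names standing in for inode_lock and lock_sem:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_sem   = PTHREAD_MUTEX_INITIALIZER;

/* Correct: every path takes inode_lock first, then lock_sem, and
 * releases in reverse order, so no wait cycle can form. */
static void writev_path(void)
{
	pthread_mutex_lock(&inode_lock);
	pthread_mutex_lock(&lock_sem);
	puts("write checks under both locks");
	pthread_mutex_unlock(&lock_sem);
	pthread_mutex_unlock(&inode_lock);
}

int main(void)
{
	writev_path();
	return 0;
}

The related cifs_relock_file() change annotates its acquisition with down_read_nested(..., SINGLE_DEPTH_NESTING) so lockdep does not misreport the legitimate nested take of lock_sem as recursion.
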
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b261db34103c..a8693632235f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -24,6 +24,7 @@
#include <linux/pagemap.h>
#include <linux/freezer.h>
#include <linux/sched/signal.h>
+#include <linux/wait_bit.h>
#include <asm/div64.h>
#include "cifsfs.h"
@@ -322,9 +323,9 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
fattr->cf_uid = cifs_sb->mnt_uid;
fattr->cf_gid = cifs_sb->mnt_gid;
- fattr->cf_atime = CURRENT_TIME;
- fattr->cf_ctime = CURRENT_TIME;
- fattr->cf_mtime = CURRENT_TIME;
+ ktime_get_real_ts(&fattr->cf_mtime);
+ fattr->cf_mtime = timespec_trunc(fattr->cf_mtime, sb->s_time_gran);
+ fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime;
fattr->cf_nlink = 2;
fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
}
@@ -563,8 +564,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path,
"SETFILEBITS", ea_value, 4 /* size of buf */,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
+ cifs_sb);
cifs_put_tlink(tlink);
if (rc < 0)
return (int)rc;
@@ -586,9 +586,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
static void
cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
- struct cifs_sb_info *cifs_sb, bool adjust_tz,
+ struct super_block *sb, bool adjust_tz,
bool symlink)
{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
memset(fattr, 0, sizeof(*fattr));
@@ -598,8 +599,10 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
if (info->LastAccessTime)
fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
- else
- fattr->cf_atime = CURRENT_TIME;
+ else {
+ ktime_get_real_ts(&fattr->cf_atime);
+ fattr->cf_atime = timespec_trunc(fattr->cf_atime, sb->s_time_gran);
+ }
fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
@@ -659,7 +662,6 @@ cifs_get_file_info(struct file *filp)
FILE_ALL_INFO find_data;
struct cifs_fattr fattr;
struct inode *inode = file_inode(filp);
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsFileInfo *cfile = filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -671,7 +673,7 @@ cifs_get_file_info(struct file *filp)
rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data);
switch (rc) {
case 0:
- cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false,
+ cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false,
false);
break;
case -EREMOTE:
@@ -753,7 +755,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
}
if (!rc) {
- cifs_all_info_to_fattr(&fattr, data, cifs_sb, adjust_tz,
+ cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz,
symlink);
} else if (rc == -EREMOTE) {
cifs_create_dfs_fattr(&fattr, sb);
@@ -1363,9 +1365,9 @@ out_reval:
cifs_inode = CIFS_I(inode);
cifs_inode->time = 0; /* will force revalidate to get info
when needed */
- inode->i_ctime = current_fs_time(sb);
+ inode->i_ctime = current_time(inode);
}
- dir->i_ctime = dir->i_mtime = current_fs_time(sb);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
cifs_inode = CIFS_I(dir);
CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
unlink_out:
@@ -1633,7 +1635,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
cifsInode->time = 0;
d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime =
- current_fs_time(inode->i_sb);
+ current_time(inode);
rmdir_exit:
kfree(full_path);
@@ -1806,7 +1808,7 @@ unlink_target:
CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime =
- target_dir->i_mtime = current_fs_time(source_dir->i_sb);
+ target_dir->i_mtime = current_time(source_dir);
cifs_rename_exit:
kfree(info_buf_source);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index b08531977daa..3b147dc6af63 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -810,7 +810,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
if (!pages) {
pages = vmalloc(max_pages * sizeof(struct page *));
- if (!bv) {
+ if (!pages) {
kvfree(bv);
return -ENOMEM;
}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 27bc360c7ffd..a723df3e0197 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -849,8 +849,13 @@ cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid *fid, __u16 search_flags,
struct cifs_search_info *srch_inf)
{
- return CIFSFindFirst(xid, tcon, path, cifs_sb,
- &fid->netfid, search_flags, srch_inf, true);
+ int rc;
+
+ rc = CIFSFindFirst(xid, tcon, path, cifs_sb,
+ &fid->netfid, search_flags, srch_inf, true);
+ if (rc)
+ cifs_dbg(FYI, "find first failed=%d\n", rc);
+ return rc;
}
static int
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index c58691834eb2..ccbb397debbc 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -982,7 +982,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
kfree(utf16_path);
if (rc) {
- cifs_dbg(VFS, "open dir failed\n");
+ cifs_dbg(FYI, "open dir failed rc=%d\n", rc);
return rc;
}
@@ -992,7 +992,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_query_directory(xid, tcon, fid->persistent_fid,
fid->volatile_fid, 0, srch_inf);
if (rc) {
- cifs_dbg(VFS, "query directory failed\n");
+ cifs_dbg(FYI, "query directory failed rc=%d\n", rc);
SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
}
return rc;
@@ -1288,6 +1288,108 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
+#ifdef CONFIG_CIFS_ACL
+static struct cifs_ntsd *
+get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb,
+ const struct cifs_fid *cifsfid, u32 *pacllen)
+{
+ struct cifs_ntsd *pntsd = NULL;
+ unsigned int xid;
+ int rc = -EOPNOTSUPP;
+ struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+
+ if (IS_ERR(tlink))
+ return ERR_CAST(tlink);
+
+ xid = get_xid();
+ cifs_dbg(FYI, "trying to get acl\n");
+
+ rc = SMB2_query_acl(xid, tlink_tcon(tlink), cifsfid->persistent_fid,
+ cifsfid->volatile_fid, (void **)&pntsd, pacllen);
+ free_xid(xid);
+
+ cifs_put_tlink(tlink);
+
+ cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen);
+ if (rc)
+ return ERR_PTR(rc);
+ return pntsd;
+
+}
+
+static struct cifs_ntsd *
+get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
+ const char *path, u32 *pacllen)
+{
+ struct cifs_ntsd *pntsd = NULL;
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ unsigned int xid;
+ int rc;
+ struct cifs_tcon *tcon;
+ struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ __le16 *utf16_path;
+
+ cifs_dbg(FYI, "get smb3 acl for path %s\n", path);
+ if (IS_ERR(tlink))
+ return ERR_CAST(tlink);
+
+ tcon = tlink_tcon(tlink);
+ xid = get_xid();
+
+ if (backup_cred(cifs_sb))
+ oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
+ else
+ oparms.create_options = 0;
+
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ if (!utf16_path)
+ return ERR_PTR(-ENOMEM);
+
+ oparms.tcon = tcon;
+ oparms.desired_access = READ_CONTROL;
+ oparms.disposition = FILE_OPEN;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+ kfree(utf16_path);
+ if (!rc) {
+ rc = SMB2_query_acl(xid, tlink_tcon(tlink), fid.persistent_fid,
+ fid.volatile_fid, (void **)&pntsd, pacllen);
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ }
+
+ cifs_put_tlink(tlink);
+ free_xid(xid);
+
+ cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen);
+ if (rc)
+ return ERR_PTR(rc);
+ return pntsd;
+}
+
+/* Retrieve an ACL from the server */
+static struct cifs_ntsd *
+get_smb2_acl(struct cifs_sb_info *cifs_sb,
+ struct inode *inode, const char *path,
+ u32 *pacllen)
+{
+ struct cifs_ntsd *pntsd = NULL;
+ struct cifsFileInfo *open_file = NULL;
+
+ if (inode)
+ open_file = find_readable_file(CIFS_I(inode), true);
+ if (!open_file)
+ return get_smb2_acl_by_path(cifs_sb, path, pacllen);
+
+ pntsd = get_smb2_acl_by_fid(cifs_sb, &open_file->fid, pacllen);
+ cifsFileInfo_put(open_file);
+ return pntsd;
+}
+#endif
+
static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len, bool keep_size)
{
@@ -1809,7 +1911,8 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
sg = init_sg(rqst, sign);
if (!sg) {
- cifs_dbg(VFS, "%s: Failed to init sg %d", __func__, rc);
+ cifs_dbg(VFS, "%s: Failed to init sg", __func__);
+ rc = -ENOMEM;
goto free_req;
}
@@ -1817,6 +1920,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
iv = kzalloc(iv_len, GFP_KERNEL);
if (!iv) {
cifs_dbg(VFS, "%s: Failed to alloc IV", __func__);
+ rc = -ENOMEM;
goto free_sg;
}
iv[0] = 3;
@@ -2391,6 +2495,11 @@ struct smb_version_operations smb20_operations = {
.dir_needs_close = smb2_dir_needs_close,
.get_dfs_refer = smb2_get_dfs_refer,
.select_sectype = smb2_select_sectype,
+#ifdef CONFIG_CIFS_ACL
+ .get_acl = get_smb2_acl,
+ .get_acl_by_fid = get_smb2_acl_by_fid,
+/* .set_acl = set_smb3_acl, */
+#endif /* CIFS_ACL */
};
struct smb_version_operations smb21_operations = {
@@ -2475,6 +2584,11 @@ struct smb_version_operations smb21_operations = {
.enum_snapshots = smb3_enum_snapshots,
.get_dfs_refer = smb2_get_dfs_refer,
.select_sectype = smb2_select_sectype,
+#ifdef CONFIG_CIFS_ACL
+ .get_acl = get_smb2_acl,
+ .get_acl_by_fid = get_smb2_acl_by_fid,
+/* .set_acl = set_smb3_acl, */
+#endif /* CIFS_ACL */
};
struct smb_version_operations smb30_operations = {
@@ -2569,6 +2683,11 @@ struct smb_version_operations smb30_operations = {
.receive_transform = smb3_receive_transform,
.get_dfs_refer = smb2_get_dfs_refer,
.select_sectype = smb2_select_sectype,
+#ifdef CONFIG_CIFS_ACL
+ .get_acl = get_smb2_acl,
+ .get_acl_by_fid = get_smb2_acl_by_fid,
+/* .set_acl = set_smb3_acl, */
+#endif /* CIFS_ACL */
};
#ifdef CONFIG_CIFS_SMB311
@@ -2751,7 +2870,7 @@ struct smb_version_values smb302_values = {
struct smb_version_values smb311_values = {
.version_string = SMB311_VERSION_STRING,
.protocol_id = SMB311_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 48ff7703b919..4938e8b6d32f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1240,15 +1240,19 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
goto tcon_exit;
}
- if (rsp->ShareType & SMB2_SHARE_TYPE_DISK)
+ switch (rsp->ShareType) {
+ case SMB2_SHARE_TYPE_DISK:
cifs_dbg(FYI, "connection to disk share\n");
- else if (rsp->ShareType & SMB2_SHARE_TYPE_PIPE) {
+ break;
+ case SMB2_SHARE_TYPE_PIPE:
tcon->ipc = true;
cifs_dbg(FYI, "connection to pipe share\n");
- } else if (rsp->ShareType & SMB2_SHARE_TYPE_PRINT) {
- tcon->print = true;
+ break;
+ case SMB2_SHARE_TYPE_PRINT:
+ tcon->ipc = true;
cifs_dbg(FYI, "connection to printer\n");
- } else {
+ break;
+ default:
cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType);
rc = -EOPNOTSUPP;
goto tcon_error_exit;
@@ -2077,8 +2081,9 @@ validate_and_copy_buf(unsigned int offset, unsigned int buffer_length,
static int
query_info(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid, u8 info_class,
- size_t output_len, size_t min_len, void *data)
+ u64 persistent_fid, u64 volatile_fid, u8 info_class, u8 info_type,
+ u32 additional_info, size_t output_len, size_t min_len, void **data,
+ u32 *dlen)
{
struct smb2_query_info_req *req;
struct smb2_query_info_rsp *rsp = NULL;
@@ -2104,10 +2109,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
if (encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->InfoType = SMB2_O_INFO_FILE;
+ req->InfoType = info_type;
req->FileInfoClass = info_class;
req->PersistentFileId = persistent_fid;
req->VolatileFileId = volatile_fid;
+ req->AdditionalInformation = cpu_to_le32(additional_info);
/* 4 for rfc1002 length field and 1 for Buffer */
req->InputBufferOffset =
cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4);
@@ -2126,24 +2132,51 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
goto qinf_exit;
}
+ if (dlen) {
+ *dlen = le32_to_cpu(rsp->OutputBufferLength);
+ if (!*data) {
+ *data = kmalloc(*dlen, GFP_KERNEL);
+ if (!*data) {
+ cifs_dbg(VFS,
+ "Error %d allocating memory for acl\n",
+ rc);
+ *dlen = 0;
+ goto qinf_exit;
+ }
+ }
+ }
+
rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength),
- &rsp->hdr, min_len, data);
+ &rsp->hdr, min_len, *data);
qinf_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
}
+int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, struct smb2_file_all_info *data)
+{
+ return query_info(xid, tcon, persistent_fid, volatile_fid,
+ FILE_ALL_INFORMATION, SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
+ sizeof(struct smb2_file_all_info), (void **)&data,
+ NULL);
+}
+
int
-SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
+SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
- struct smb2_file_all_info *data)
+ void **data, u32 *plen)
{
+ __u32 additional_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO;
+ *plen = 0;
+
return query_info(xid, tcon, persistent_fid, volatile_fid,
- FILE_ALL_INFORMATION,
- sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
- sizeof(struct smb2_file_all_info), data);
+ 0, SMB2_O_INFO_SECURITY, additional_info,
+ SMB2_MAX_BUFFER_SIZE,
+ sizeof(struct smb2_file_all_info), data, plen);
}
int
@@ -2151,9 +2184,10 @@ SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, __le64 *uniqueid)
{
return query_info(xid, tcon, persistent_fid, volatile_fid,
- FILE_INTERNAL_INFORMATION,
+ FILE_INTERNAL_INFORMATION, SMB2_O_INFO_FILE, 0,
sizeof(struct smb2_file_internal_info),
- sizeof(struct smb2_file_internal_info), uniqueid);
+ sizeof(struct smb2_file_internal_info),
+ (void **)&uniqueid, NULL);
}
/*
@@ -2173,9 +2207,7 @@ smb2_echo_callback(struct mid_q_entry *mid)
if (mid->mid_state == MID_RESPONSE_RECEIVED)
credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest);
- mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
- mutex_unlock(&server->srv_mutex);
add_credits(server, credits_received, CIFS_ECHO_OP);
}
@@ -2433,9 +2465,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
cifs_stats_fail_inc(tcon, SMB2_READ_HE);
queue_work(cifsiod_wq, &rdata->work);
- mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
- mutex_unlock(&server->srv_mutex);
add_credits(server, credits_received, 0);
}
@@ -2594,7 +2624,6 @@ smb2_writev_callback(struct mid_q_entry *mid)
{
struct cifs_writedata *wdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
- struct TCP_Server_Info *server = tcon->ses->server;
unsigned int written;
struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
unsigned int credits_received = 1;
@@ -2634,9 +2663,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
queue_work(cifsiod_wq, &wdata->work);
- mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
- mutex_unlock(&server->srv_mutex);
add_credits(tcon->ses->server, credits_received, 0);
}
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 6853454fc871..3595cd755147 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -135,6 +135,9 @@ extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
struct smb2_file_all_info *data);
+extern int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_file_id, u64 volatile_file_id,
+ void **data, unsigned int *plen);
extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
__le64 *uniqueid);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index c69ec96e92ac..67367cf1f8cd 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -335,9 +335,31 @@ generate_smb3signingkey(struct cifs_ses *ses,
if (rc)
return rc;
- return generate_key(ses, ptriplet->decryption.label,
- ptriplet->decryption.context,
- ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+ rc = generate_key(ses, ptriplet->decryption.label,
+ ptriplet->decryption.context,
+ ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+
+ if (rc)
+ return rc;
+
+#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
+ cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__);
+ /*
+ * The session id is opaque in terms of endianness, so we can't
+ * print it as a long long. We dump it as we got it on the wire.
+ */
+ cifs_dbg(VFS, "Session Id %*ph\n", (int)sizeof(ses->Suid),
+ &ses->Suid);
+ cifs_dbg(VFS, "Session Key %*ph\n",
+ SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response);
+ cifs_dbg(VFS, "Signing Key %*ph\n",
+ SMB3_SIGN_KEY_SIZE, ses->smb3signingkey);
+ cifs_dbg(VFS, "ServerIn Key %*ph\n",
+ SMB3_SIGN_KEY_SIZE, ses->smb3encryptionkey);
+ cifs_dbg(VFS, "ServerOut Key %*ph\n",
+ SMB3_SIGN_KEY_SIZE, ses->smb3decryptionkey);
+#endif
+ return rc;
}
int
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 4d64b5b8fc9c..7efbab013957 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -94,7 +94,7 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
now = jiffies;
/* commands taking longer than one second are indications that
something is wrong, unless it is quite a slow link or server */
- if ((now - midEntry->when_alloc) > HZ) {
+ if (time_after(now, midEntry->when_alloc + HZ)) {
if ((cifsFYI & CIFS_TIMER) && (midEntry->command != command)) {
pr_debug(" CIFS slow rsp: cmd %d mid %llu",
midEntry->command, midEntry->mid);
@@ -536,11 +536,14 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
list_add_tail(&mid->qhead, &server->pending_mid_q);
spin_unlock(&GlobalMid_Lock);
-
+ /*
+ * Need to store the time in mid before calling I/O. For call_async,
+ * I/O response may come back and free the mid entry on another thread.
+ */
+ cifs_save_when_sent(mid);
cifs_in_send_inc(server);
rc = smb_send_rqst(server, rqst, flags);
cifs_in_send_dec(server);
- cifs_save_when_sent(mid);
if (rc < 0) {
server->sequence_number -= 2;
@@ -613,9 +616,7 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
}
spin_unlock(&GlobalMid_Lock);
- mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
- mutex_unlock(&server->srv_mutex);
return rc;
}
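
Replacing the open-coded (now - midEntry->when_alloc) > HZ with time_after() matters because jiffies is an unsigned counter that wraps; the kernel's time_after() compares via a signed subtraction, so it stays correct across the wrap point. A userspace demonstration of the idiom:

#include <limits.h>
#include <stdio.h>

/* The classic jiffies idiom: correct even when 'a' has wrapped past
 * 'b', as long as they are within LONG_MAX ticks of each other. */
#define time_after(a, b) ((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long b = ULONG_MAX - 5;	/* just before wrap   */
	unsigned long a = b + 10;		/* wrapped: tiny value */

	printf("naive a > b:     %d\n", a > b);			/* 0: wrong */
	printf("time_after(a,b): %d\n", time_after(a, b));	/* 1: right */
	return 0;
}
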
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 20af5187ba63..de50e749ff05 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -188,8 +188,6 @@ static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode,
pcreatetime = (__u64 *)value;
*pcreatetime = CIFS_I(inode)->createtime;
return sizeof(__u64);
-
- return rc;
}
@@ -235,8 +233,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
if (pTcon->ses->server->ops->query_all_EAs)
rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
- full_path, name, value, size,
- cifs_sb->local_nls, cifs_remap(cifs_sb));
+ full_path, name, value, size, cifs_sb);
break;
case XATTR_CIFS_ACL: {
@@ -336,8 +333,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
if (pTcon->ses->server->ops->query_all_EAs)
rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
- full_path, NULL, data, buf_size,
- cifs_sb->local_nls, cifs_remap(cifs_sb));
+ full_path, NULL, data, buf_size, cifs_sb);
list_ea_exit:
kfree(full_path);
free_xid(xid);
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 9d956cd6d46f..363402fcb3ed 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -34,7 +34,7 @@ coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
- return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos);
+ return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0);
}
static ssize_t
@@ -51,7 +51,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
host_file = cfi->cfi_container;
file_start_write(host_file);
inode_lock(coda_inode);
- ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos);
+ ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0);
coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 11d087b2b28e..2dd4a7af7dd7 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -739,23 +739,22 @@ static int do_i2c_smbus_ioctl(struct file *file,
unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata)
{
struct i2c_smbus_ioctl_data __user *tdata;
- compat_caddr_t datap;
+ union {
+ /* beginnings of those have identical layouts */
+ struct i2c_smbus_ioctl_data32 data32;
+ struct i2c_smbus_ioctl_data data;
+ } v;
tdata = compat_alloc_user_space(sizeof(*tdata));
if (tdata == NULL)
return -ENOMEM;
- if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata)))
- return -EFAULT;
- if (!access_ok(VERIFY_READ, udata, sizeof(*udata)))
+ memset(&v, 0, sizeof(v));
+ if (copy_from_user(&v.data32, udata, sizeof(v.data32)))
return -EFAULT;
+ v.data.data = compat_ptr(v.data32.data);
- if (__copy_in_user(&tdata->read_write, &udata->read_write, 2 * sizeof(u8)))
- return -EFAULT;
- if (__copy_in_user(&tdata->size, &udata->size, 2 * sizeof(u32)))
- return -EFAULT;
- if (__get_user(datap, &udata->data) ||
- __put_user(compat_ptr(datap), &tdata->data))
+ if (copy_to_user(tdata, &v.data, sizeof(v.data)))
return -EFAULT;
return do_ioctl(file, cmd, (unsigned long)tdata);
@@ -833,7 +832,7 @@ static int compat_ioctl_preallocate(struct file *file,
*/
#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)
-#define COMPATIBLE_IOCTL(cmd) XFORM(cmd),
+#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd),
/* ioctl should not be warned about even if it's not implemented.
Valid reasons to use this:
- It is implemented with ->compat_ioctl on some device, but programs
@@ -866,8 +865,6 @@ COMPATIBLE_IOCTL(TIOCGDEV)
COMPATIBLE_IOCTL(TIOCCBRK)
COMPATIBLE_IOCTL(TIOCGSID)
COMPATIBLE_IOCTL(TIOCGICOUNT)
-COMPATIBLE_IOCTL(TIOCGPKT)
-COMPATIBLE_IOCTL(TIOCGPTLCK)
COMPATIBLE_IOCTL(TIOCGEXCL)
/* Little t */
COMPATIBLE_IOCTL(TIOCGETD)
@@ -883,16 +880,12 @@ COMPATIBLE_IOCTL(TIOCMGET)
COMPATIBLE_IOCTL(TIOCMBIC)
COMPATIBLE_IOCTL(TIOCMBIS)
COMPATIBLE_IOCTL(TIOCMSET)
-COMPATIBLE_IOCTL(TIOCPKT)
COMPATIBLE_IOCTL(TIOCNOTTY)
COMPATIBLE_IOCTL(TIOCSTI)
COMPATIBLE_IOCTL(TIOCOUTQ)
COMPATIBLE_IOCTL(TIOCSPGRP)
COMPATIBLE_IOCTL(TIOCGPGRP)
-COMPATIBLE_IOCTL(TIOCGPTN)
-COMPATIBLE_IOCTL(TIOCSPTLCK)
COMPATIBLE_IOCTL(TIOCSERGETLSR)
-COMPATIBLE_IOCTL(TIOCSIG)
#ifdef TIOCSRS485
COMPATIBLE_IOCTL(TIOCSRS485)
#endif
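
The i2c_smbus compat path drops the fragile field-by-field __copy_in_user() calls: because the 32-bit and native structs share an identical layout up to the final pointer, the whole 32-bit struct can be copied in once, the pointer widened with compat_ptr(), and the native struct copied back out. A userspace model of the common-prefix trick; memcpy stands in for copy_from_user/copy_to_user, and the struct layouts are assumptions modeled on the ioctl data:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct data32 { uint8_t read_write; uint8_t command; uint32_t size; uint32_t data; };
struct data   { uint8_t read_write; uint8_t command; uint32_t size; void *data; };

int main(void)
{
	struct data32 from_user32 = { 1, 2, 3, 0x1234 };  /* 32-bit user ptr */
	union {
		struct data32 d32;	/* identical prefix ...  */
		struct data d;		/* ... up to the pointer */
	} v;
	struct data out;

	memset(&v, 0, sizeof(v));
	memcpy(&v.d32, &from_user32, sizeof(v.d32));	/* copy_from_user */
	v.d.data = (void *)(uintptr_t)v.d32.data;	/* compat_ptr()   */
	memcpy(&out, &v.d, sizeof(out));		/* copy back out  */

	printf("widened data pointer: %p\n", out.data);
	return 0;
}
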
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 8b2a994042dd..a66f6624d899 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -138,6 +138,14 @@ struct config_item *config_item_get(struct config_item *item)
}
EXPORT_SYMBOL(config_item_get);
+struct config_item *config_item_get_unless_zero(struct config_item *item)
+{
+ if (item && kref_get_unless_zero(&item->ci_kref))
+ return item;
+ return NULL;
+}
+EXPORT_SYMBOL(config_item_get_unless_zero);
+
static void config_item_cleanup(struct config_item *item)
{
struct config_item_type *t = item->ci_type;
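
config_item_get_unless_zero() is the usual "take a reference only if the object is still alive" helper: kref_get_unless_zero() refuses to increment a count that has already reached zero, so a lookup racing against the final put cannot resurrect a dying item. A hedged sketch of the underlying pattern in portable C11 atomics (illustrative types, not kernel API):

    #include <stdatomic.h>
    #include <stddef.h>

    struct obj { atomic_int refcount; };

    /* Succeed only while at least one reference is still held elsewhere. */
    static struct obj *obj_get_unless_zero(struct obj *o)
    {
        int c = atomic_load(&o->refcount);

        while (c != 0) {
            /* CAS: increment only from a non-zero snapshot; on failure
             * c is reloaded and the zero check repeats. */
            if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
                return o;   /* reference taken */
        }
        return NULL;        /* object already dying */
    }
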
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index a6ab012a2c6a..c8aabba502f6 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -83,14 +83,13 @@ static int create_link(struct config_item *parent_item,
ret = -ENOMEM;
sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
if (sl) {
- sl->sl_target = config_item_get(item);
spin_lock(&configfs_dirent_lock);
if (target_sd->s_type & CONFIGFS_USET_DROPPING) {
spin_unlock(&configfs_dirent_lock);
- config_item_put(item);
kfree(sl);
return -ENOENT;
}
+ sl->sl_target = config_item_get(item);
list_add(&sl->sl_list, &target_sd->s_links);
spin_unlock(&configfs_dirent_lock);
ret = configfs_create_link(sl, parent_item->ci_dentry,
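
The reordering above takes the reference on the symlink target only after the CONFIGFS_USET_DROPPING check has passed under configfs_dirent_lock, so the error path no longer needs a matching config_item_put(). A small sketch of the check-then-get shape, assuming hypothetical names:

    #include <pthread.h>

    struct target { int dropping; int refs; };
    static void target_get(struct target *t) { t->refs++; } /* caller holds lock */

    static int link_to(struct target *t, pthread_mutex_t *lock)
    {
        pthread_mutex_lock(lock);
        if (t->dropping) {
            pthread_mutex_unlock(lock);
            return -1;      /* no get() taken, nothing to undo */
        }
        target_get(t);      /* take the reference only once validated */
        pthread_mutex_unlock(lock);
        return 0;
    }
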
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 08b46e6e3995..02b7d91c9231 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -7,6 +7,7 @@ config FS_ENCRYPTION
select CRYPTO_XTS
select CRYPTO_CTS
select CRYPTO_CTR
+ select CRYPTO_SHA256
select KEYS
help
Enable encryption of files and directories. This
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index a409a84f1bca..6181e9526860 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -129,7 +129,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
goto errout;
}
err = submit_bio_wait(bio);
- if ((err == 0) && bio->bi_error)
+ if (err == 0 && bio->bi_status)
err = -EIO;
bio_put(bio);
if (err)
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 6d6eca394d4d..c7835df7e7b8 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -26,6 +26,7 @@
#include <linux/ratelimit.h>
#include <linux/dcache.h>
#include <linux/namei.h>
+#include <crypto/aes.h>
#include "fscrypt_private.h"
static unsigned int num_prealloc_crypto_pages = 32;
@@ -147,8 +148,8 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
{
struct {
__le64 index;
- u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)];
- } xts_tweak;
+ u8 padding[FS_IV_SIZE - sizeof(__le64)];
+ } iv;
struct skcipher_request *req = NULL;
DECLARE_FS_COMPLETION_RESULT(ecr);
struct scatterlist dst, src;
@@ -158,6 +159,16 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
BUG_ON(len == 0);
+ BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE);
+ BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE);
+ iv.index = cpu_to_le64(lblk_num);
+ memset(iv.padding, 0, sizeof(iv.padding));
+
+ if (ci->ci_essiv_tfm != NULL) {
+ crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv,
+ (u8 *)&iv);
+ }
+
req = skcipher_request_alloc(tfm, gfp_flags);
if (!req) {
printk_ratelimited(KERN_ERR
@@ -170,15 +181,11 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
page_crypt_complete, &ecr);
- BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE);
- xts_tweak.index = cpu_to_le64(lblk_num);
- memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding));
-
sg_init_table(&dst, 1);
sg_set_page(&dst, dest_page, len, offs);
sg_init_table(&src, 1);
sg_set_page(&src, src_page, len, offs);
- skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak);
+ skcipher_request_set_crypt(req, &src, &dst, len, &iv);
if (rw == FS_DECRYPT)
res = crypto_skcipher_decrypt(req);
else
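
With this hunk the per-page IV starts out as the little-endian logical block number zero-padded to 16 bytes, and when an ESSIV tfm is present it is additionally encrypted with AES keyed by a hash of the master key, making IVs unpredictable for the CBC mode added in this series. A hedged sketch of the derivation, using OpenSSL purely for illustration:

    #include <stdint.h>
    #include <string.h>
    #include <openssl/aes.h>
    #include <openssl/sha.h>

    /* ESSIV sketch: IV = AES_{SHA256(key)}(le64(lblk) || zero padding) */
    static void essiv_iv(uint8_t iv[16], const uint8_t *key, size_t keylen,
                         uint64_t lblk)
    {
        uint8_t salt[SHA256_DIGEST_LENGTH];
        AES_KEY essiv;

        memset(iv, 0, 16);
        for (int i = 0; i < 8; i++)     /* little-endian block number */
            iv[i] = (uint8_t)(lblk >> (8 * i));

        SHA256(key, keylen, salt);      /* 256-bit ESSIV key */
        AES_set_encrypt_key(salt, 256, &essiv);
        AES_encrypt(iv, iv, &essiv);    /* single block, in place */
    }
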
@@ -477,6 +484,8 @@ static void __exit fscrypt_exit(void)
destroy_workqueue(fscrypt_read_workqueue);
kmem_cache_destroy(fscrypt_ctx_cachep);
kmem_cache_destroy(fscrypt_info_cachep);
+
+ fscrypt_essiv_cleanup();
}
module_exit(fscrypt_exit);
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index d1bb02b1ee58..ad9f814fdead 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -453,12 +453,3 @@ errout:
return ret;
}
EXPORT_SYMBOL(fscrypt_setup_filename);
-
-void fscrypt_free_filename(struct fscrypt_name *fname)
-{
- kfree(fname->crypto_buf.name);
- fname->crypto_buf.name = NULL;
- fname->usr_fname = NULL;
- fname->disk_name.name = NULL;
-}
-EXPORT_SYMBOL(fscrypt_free_filename);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 1e1f8a361b75..a1d5021c31ef 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -12,10 +12,13 @@
#define _FSCRYPT_PRIVATE_H
#include <linux/fscrypt_supp.h>
+#include <crypto/hash.h>
/* Encryption parameters */
-#define FS_XTS_TWEAK_SIZE 16
+#define FS_IV_SIZE 16
#define FS_AES_128_ECB_KEY_SIZE 16
+#define FS_AES_128_CBC_KEY_SIZE 16
+#define FS_AES_128_CTS_KEY_SIZE 16
#define FS_AES_256_GCM_KEY_SIZE 32
#define FS_AES_256_CBC_KEY_SIZE 32
#define FS_AES_256_CTS_KEY_SIZE 32
@@ -54,6 +57,7 @@ struct fscrypt_info {
u8 ci_filename_mode;
u8 ci_flags;
struct crypto_skcipher *ci_ctfm;
+ struct crypto_cipher *ci_essiv_tfm;
u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
};
@@ -87,4 +91,7 @@ extern int fscrypt_do_page_crypto(const struct inode *inode,
extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
gfp_t gfp_flags);
+/* keyinfo.c */
+extern void __exit fscrypt_essiv_cleanup(void);
+
#endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 179e578b875b..018c588c7ac3 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -10,8 +10,13 @@
#include <keys/user-type.h>
#include <linux/scatterlist.h>
+#include <linux/ratelimit.h>
+#include <crypto/aes.h>
+#include <crypto/sha.h>
#include "fscrypt_private.h"
+static struct crypto_shash *essiv_hash_tfm;
+
static void derive_crypt_complete(struct crypto_async_request *req, int rc)
{
struct fscrypt_completion_result *ecr = req->data;
@@ -27,13 +32,13 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc)
* derive_key_aes() - Derive a key using AES-128-ECB
* @deriving_key: Encryption key used for derivation.
* @source_key: Source key to which to apply derivation.
- * @derived_key: Derived key.
+ * @derived_raw_key: Derived raw key.
*
* Return: Zero on success; non-zero otherwise.
*/
static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
- u8 source_key[FS_AES_256_XTS_KEY_SIZE],
- u8 derived_key[FS_AES_256_XTS_KEY_SIZE])
+ const struct fscrypt_key *source_key,
+ u8 derived_raw_key[FS_MAX_KEY_SIZE])
{
int res = 0;
struct skcipher_request *req = NULL;
@@ -60,10 +65,10 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
if (res < 0)
goto out;
- sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE);
- sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg,
- FS_AES_256_XTS_KEY_SIZE, NULL);
+ sg_init_one(&src_sg, source_key->raw, source_key->size);
+ sg_init_one(&dst_sg, derived_raw_key, source_key->size);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size,
+ NULL);
res = crypto_skcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
wait_for_completion(&ecr.completion);
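
derive_key_aes() now takes the caller's fscrypt_key so the derivation handles any supported key size: the master key bytes are encrypted with AES-128-ECB keyed by the per-inode nonce, producing a derived key of the same length (the size check below enforces a multiple of AES_BLOCK_SIZE). A hedged sketch of that KDF shape, with OpenSSL standing in for the kernel crypto API:

    #include <stddef.h>
    #include <stdint.h>
    #include <openssl/aes.h>

    /* KDF sketch: derived = AES128-ECB_{nonce}(master), block by block;
     * size is assumed to be a multiple of AES_BLOCK_SIZE. */
    static void derive_key(uint8_t *derived, const uint8_t *master,
                           size_t size, const uint8_t nonce[16])
    {
        AES_KEY k;

        AES_set_encrypt_key(nonce, 128, &k);
        for (size_t off = 0; off < size; off += AES_BLOCK_SIZE)
            AES_encrypt(master + off, derived + off, &k);
    }
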
@@ -77,7 +82,7 @@ out:
static int validate_user_key(struct fscrypt_info *crypt_info,
struct fscrypt_context *ctx, u8 *raw_key,
- const char *prefix)
+ const char *prefix, int min_keysize)
{
char *description;
struct key *keyring_key;
@@ -111,50 +116,60 @@ static int validate_user_key(struct fscrypt_info *crypt_info,
master_key = (struct fscrypt_key *)ukp->data;
BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
- if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
+ if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE
+ || master_key->size % AES_BLOCK_SIZE != 0) {
printk_once(KERN_WARNING
"%s: key size incorrect: %d\n",
__func__, master_key->size);
res = -ENOKEY;
goto out;
}
- res = derive_key_aes(ctx->nonce, master_key->raw, raw_key);
+ res = derive_key_aes(ctx->nonce, master_key, raw_key);
out:
up_read(&keyring_key->sem);
key_put(keyring_key);
return res;
}
+static const struct {
+ const char *cipher_str;
+ int keysize;
+} available_modes[] = {
+ [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)",
+ FS_AES_256_XTS_KEY_SIZE },
+ [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))",
+ FS_AES_256_CTS_KEY_SIZE },
+ [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)",
+ FS_AES_128_CBC_KEY_SIZE },
+ [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))",
+ FS_AES_128_CTS_KEY_SIZE },
+};
+
static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode,
const char **cipher_str_ret, int *keysize_ret)
{
- if (S_ISREG(inode->i_mode)) {
- if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) {
- *cipher_str_ret = "xts(aes)";
- *keysize_ret = FS_AES_256_XTS_KEY_SIZE;
- return 0;
- }
- pr_warn_once("fscrypto: unsupported contents encryption mode "
- "%d for inode %lu\n",
- ci->ci_data_mode, inode->i_ino);
- return -ENOKEY;
+ u32 mode;
+
+ if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) {
+ pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n",
+ inode->i_ino,
+ ci->ci_data_mode, ci->ci_filename_mode);
+ return -EINVAL;
}
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) {
- if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) {
- *cipher_str_ret = "cts(cbc(aes))";
- *keysize_ret = FS_AES_256_CTS_KEY_SIZE;
- return 0;
- }
- pr_warn_once("fscrypto: unsupported filenames encryption mode "
- "%d for inode %lu\n",
- ci->ci_filename_mode, inode->i_ino);
- return -ENOKEY;
+ if (S_ISREG(inode->i_mode)) {
+ mode = ci->ci_data_mode;
+ } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) {
+ mode = ci->ci_filename_mode;
+ } else {
+ WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
+ inode->i_ino, (inode->i_mode & S_IFMT));
+ return -EINVAL;
}
- pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n",
- (inode->i_mode & S_IFMT), inode->i_ino);
- return -ENOKEY;
+ *cipher_str_ret = available_modes[mode].cipher_str;
+ *keysize_ret = available_modes[mode].keysize;
+ return 0;
}
static void put_crypt_info(struct fscrypt_info *ci)
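
Collapsing the per-file-type if/else ladder into the available_modes[] array keeps each mode's cipher string and key size in one row, so supporting a new mode becomes a one-line table entry plus the validity check. A minimal sketch of the same designated-initializer lookup (mode numbers and sizes here are illustrative):

    #include <stdio.h>

    enum { MODE_XTS = 1, MODE_CTS = 4 };    /* illustrative mode numbers */

    static const struct {
        const char *cipher_str;
        int keysize;
    } modes[] = {
        [MODE_XTS] = { "xts(aes)", 64 },
        [MODE_CTS] = { "cts(cbc(aes))", 32 },
    };

    int main(void)
    {
        int mode = MODE_XTS;                /* normally validated first */
        printf("%s / %d bytes\n", modes[mode].cipher_str,
               modes[mode].keysize);
        return 0;
    }
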
@@ -163,9 +178,76 @@ static void put_crypt_info(struct fscrypt_info *ci)
return;
crypto_free_skcipher(ci->ci_ctfm);
+ crypto_free_cipher(ci->ci_essiv_tfm);
kmem_cache_free(fscrypt_info_cachep, ci);
}
+static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
+{
+ struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm);
+
+ /* init hash transform on demand */
+ if (unlikely(!tfm)) {
+ struct crypto_shash *prev_tfm;
+
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(tfm)) {
+ pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n",
+ PTR_ERR(tfm));
+ return PTR_ERR(tfm);
+ }
+ prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm);
+ if (prev_tfm) {
+ crypto_free_shash(tfm);
+ tfm = prev_tfm;
+ }
+ }
+
+ {
+ SHASH_DESC_ON_STACK(desc, tfm);
+ desc->tfm = tfm;
+ desc->flags = 0;
+
+ return crypto_shash_digest(desc, key, keysize, salt);
+ }
+}
+
+static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
+ int keysize)
+{
+ int err;
+ struct crypto_cipher *essiv_tfm;
+ u8 salt[SHA256_DIGEST_SIZE];
+
+ essiv_tfm = crypto_alloc_cipher("aes", 0, 0);
+ if (IS_ERR(essiv_tfm))
+ return PTR_ERR(essiv_tfm);
+
+ ci->ci_essiv_tfm = essiv_tfm;
+
+ err = derive_essiv_salt(raw_key, keysize, salt);
+ if (err)
+ goto out;
+
+ /*
+ * Using SHA-256 to derive the salt/key means AES-256 is used for IV
+ * generation, while file contents encryption still uses the
+ * configured key size (AES-128).
+ */
+ err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt));
+ if (err)
+ goto out;
+
+out:
+ memzero_explicit(salt, sizeof(salt));
+ return err;
+}
+
+void __exit fscrypt_essiv_cleanup(void)
+{
+ crypto_free_shash(essiv_hash_tfm);
+}
+
int fscrypt_get_encryption_info(struct inode *inode)
{
struct fscrypt_info *crypt_info;
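
derive_essiv_salt() above allocates the SHA-256 transform on first use and publishes it with cmpxchg(); a racer that loses frees its own allocation and adopts the winner's, and fscrypt_essiv_cleanup() frees whichever pointer won. The same lock-free once-only initialization in portable C11, with a malloc() stand-in for the tfm:

    #include <stdatomic.h>
    #include <stdlib.h>

    static _Atomic(void *) singleton;

    static void *get_singleton(void)
    {
        void *p = atomic_load(&singleton);

        if (!p) {
            void *mine = malloc(64);        /* stand-in resource */
            void *expected = NULL;

            if (!mine)
                return NULL;
            /* Publish ours, or adopt whoever beat us to it. */
            if (atomic_compare_exchange_strong(&singleton, &expected, mine)) {
                p = mine;                   /* we won the race */
            } else {
                free(mine);                 /* lost: drop ours */
                p = expected;
            }
        }
        return p;
    }
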
@@ -212,6 +294,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
crypt_info->ci_data_mode = ctx.contents_encryption_mode;
crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
crypt_info->ci_ctfm = NULL;
+ crypt_info->ci_essiv_tfm = NULL;
memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
sizeof(crypt_info->ci_master_key));
@@ -228,10 +311,12 @@ int fscrypt_get_encryption_info(struct inode *inode)
if (!raw_key)
goto out;
- res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX);
+ res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX,
+ keysize);
if (res && inode->i_sb->s_cop->key_prefix) {
int res2 = validate_user_key(crypt_info, &ctx, raw_key,
- inode->i_sb->s_cop->key_prefix);
+ inode->i_sb->s_cop->key_prefix,
+ keysize);
if (res2) {
if (res2 == -ENOKEY)
res = -ENOKEY;
@@ -243,18 +328,30 @@ int fscrypt_get_encryption_info(struct inode *inode)
ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
if (!ctfm || IS_ERR(ctfm)) {
res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
- printk(KERN_DEBUG
- "%s: error %d (inode %u) allocating crypto tfm\n",
- __func__, res, (unsigned) inode->i_ino);
+ pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n",
+ __func__, res, inode->i_ino);
goto out;
}
crypt_info->ci_ctfm = ctfm;
crypto_skcipher_clear_flags(ctfm, ~0);
crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ /*
+ * If the provided key is longer than keysize, use only the first
+ * keysize bytes of the derived key.
+ */
res = crypto_skcipher_setkey(ctfm, raw_key, keysize);
if (res)
goto out;
+ if (S_ISREG(inode->i_mode) &&
+ crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) {
+ res = init_essiv_generator(crypt_info, raw_key, keysize);
+ if (res) {
+ pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n",
+ __func__, res, inode->i_ino);
+ goto out;
+ }
+ }
if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL)
crypt_info = NULL;
out:
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 210976e7a269..ce07a86200f3 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -38,12 +38,8 @@ static int create_encryption_context_from_policy(struct inode *inode,
memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
FS_KEY_DESCRIPTOR_SIZE);
- if (!fscrypt_valid_contents_enc_mode(
- policy->contents_encryption_mode))
- return -EINVAL;
-
- if (!fscrypt_valid_filenames_enc_mode(
- policy->filenames_encryption_mode))
+ if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+ policy->filenames_encryption_mode))
return -EINVAL;
if (policy->flags & ~FS_POLICY_FLAGS_VALID)
@@ -260,6 +256,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
memcpy(ctx.master_key_descriptor, ci->ci_master_key,
FS_KEY_DESCRIPTOR_SIZE);
get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+ BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
res = parent->i_sb->s_cop->set_context(child, &ctx,
sizeof(ctx), fs_data);
if (res)
diff --git a/fs/dax.c b/fs/dax.c
index 43bbd6d1037d..306c2b603fb8 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -25,7 +25,6 @@
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
-#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
@@ -84,7 +83,7 @@ struct exceptional_entry_key {
};
struct wait_exceptional_entry_queue {
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct exceptional_entry_key key;
};
@@ -108,7 +107,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
return wait_table + hash;
}
-static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
int sync, void *keyp)
{
struct exceptional_entry_key *key = keyp;
@@ -461,35 +460,6 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
}
/*
- * Invalidate exceptional DAX entry if easily possible. This handles DAX
- * entries for invalidate_inode_pages() so we evict the entry only if we can
- * do so without blocking.
- */
-int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
- int ret = 0;
- void *entry, **slot;
- struct radix_tree_root *page_tree = &mapping->page_tree;
-
- spin_lock_irq(&mapping->tree_lock);
- entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
- if (!entry || !radix_tree_exceptional_entry(entry) ||
- slot_locked(mapping, slot))
- goto out;
- if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
- radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
- goto out;
- radix_tree_delete(page_tree, index);
- mapping->nrexceptional--;
- ret = 1;
-out:
- spin_unlock_irq(&mapping->tree_lock);
- if (ret)
- dax_wake_mapping_entry_waiter(mapping, index, entry, true);
- return ret;
-}
-
-/*
* Invalidate exceptional DAX entry if it is clean.
*/
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
@@ -509,21 +479,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
static int dax_load_hole(struct address_space *mapping, void **entry,
struct vm_fault *vmf)
{
+ struct inode *inode = mapping->host;
struct page *page;
int ret;
/* Hole page already exists? Return it... */
if (!radix_tree_exceptional_entry(*entry)) {
page = *entry;
- goto out;
+ goto finish_fault;
}
/* This will replace locked radix tree entry with a hole page */
page = find_or_create_page(mapping, vmf->pgoff,
vmf->gfp_mask | __GFP_ZERO);
- if (!page)
- return VM_FAULT_OOM;
- out:
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+
+finish_fault:
vmf->page = page;
ret = finish_fault(vmf);
vmf->page = NULL;
@@ -531,8 +505,10 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
if (!ret) {
/* Grab reference for PTE that is now referencing the page */
get_page(page);
- return VM_FAULT_NOPAGE;
+ ret = VM_FAULT_NOPAGE;
}
+out:
+ trace_dax_load_hole(inode, vmf, ret);
return ret;
}
@@ -807,7 +783,7 @@ static int dax_writeback_one(struct block_device *bdev,
}
dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
- wb_cache_pmem(kaddr, size);
+ dax_flush(dax_dev, pgoff, kaddr, size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
* cannot be new dirty data in the pfn after the flush has completed as
@@ -817,6 +793,7 @@ static int dax_writeback_one(struct block_device *bdev,
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
+ trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
dax_unlock:
dax_read_unlock(id);
put_locked_mapping_entry(mapping, index, entry);
@@ -857,6 +834,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
start_index = wbc->range_start >> PAGE_SHIFT;
end_index = wbc->range_end >> PAGE_SHIFT;
+ trace_dax_writeback_range(inode, start_index, end_index);
+
tag_pages_for_writeback(mapping, start_index, end_index);
pagevec_init(&pvec, 0);
@@ -877,13 +856,16 @@ int dax_writeback_mapping_range(struct address_space *mapping,
ret = dax_writeback_one(bdev, dax_dev, mapping,
indices[i], pvec.pages[i]);
if (ret < 0) {
- put_dax(dax_dev);
- return ret;
+ mapping_set_error(mapping, ret);
+ goto out;
}
}
+ start_index = indices[pvec.nr - 1] + 1;
}
+out:
put_dax(dax_dev);
- return 0;
+ trace_dax_writeback_range_done(inode, start_index, end_index);
+ return (ret < 0 ? ret : 0);
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
@@ -916,6 +898,7 @@ static int dax_insert_mapping(struct address_space *mapping,
return PTR_ERR(ret);
*entryp = ret;
+ trace_dax_insert_mapping(mapping->host, vmf, ret);
return vm_insert_mixed(vma, vaddr, pfn);
}
@@ -927,6 +910,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
void *entry, **slot;
pgoff_t index = vmf->pgoff;
@@ -936,6 +920,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
if (entry)
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
+ trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
@@ -948,6 +933,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
*/
finish_mkwrite_fault(vmf);
put_locked_mapping_entry(mapping, index, entry);
+ trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -980,18 +966,19 @@ int __dax_zero_page_range(struct block_device *bdev,
void *kaddr;
pfn_t pfn;
- rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
if (rc)
return rc;
id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr,
+ rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr,
&pfn);
if (rc < 0) {
dax_read_unlock(id);
return rc;
}
- clear_pmem(kaddr + offset, size);
+ memset(kaddr + offset, 0, size);
+ dax_flush(dax_dev, pgoff, kaddr + offset, size);
dax_read_unlock(id);
}
return 0;
@@ -1031,7 +1018,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* into page tables. We have to tear down these mappings so that data
* written by write(2) is visible in mmap.
*/
- if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+ if (iomap->flags & IOMAP_F_NEW) {
invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
@@ -1070,7 +1057,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
map_len = end - pos;
if (iov_iter_rw(iter) == WRITE)
- map_len = copy_from_iter_pmem(kaddr, map_len, iter);
+ map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
+ map_len, iter);
else
map_len = copy_to_iter(kaddr, map_len, iter);
if (map_len <= 0) {
@@ -1150,34 +1138,50 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
int vmf_ret = 0;
void *entry;
+ trace_dax_pte_fault(inode, vmf, vmf_ret);
/*
* Check whether offset isn't beyond end of file now. Caller is supposed
* to hold locks serializing us with truncate / punch hole so this is
* a reliable test.
*/
- if (pos >= i_size_read(inode))
- return VM_FAULT_SIGBUS;
+ if (pos >= i_size_read(inode)) {
+ vmf_ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE;
+ entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+ if (IS_ERR(entry)) {
+ vmf_ret = dax_fault_return(PTR_ERR(entry));
+ goto out;
+ }
+
+ /*
+ * It is possible, particularly with mixed reads & writes to private
+ * mappings, that we have raced with a PMD fault that overlaps with
+ * the PTE we need to set up. If so just return and the fault will be
+ * retried.
+ */
+ if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
+ vmf_ret = VM_FAULT_NOPAGE;
+ goto unlock_entry;
+ }
+
/*
 * Note that we don't bother to use iomap_apply here: DAX requires
 * the file system block size to equal the page size, which means
* that we never have to deal with more than a single extent here.
*/
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
- if (error)
- return dax_fault_return(error);
- if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
- vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
- goto finish_iomap;
+ if (error) {
+ vmf_ret = dax_fault_return(error);
+ goto unlock_entry;
}
-
- entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
- if (IS_ERR(entry)) {
- vmf_ret = dax_fault_return(PTR_ERR(entry));
- goto finish_iomap;
+ if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+ error = -EIO; /* fs corruption? */
+ goto error_finish_iomap;
}
sector = dax_iomap_sector(&iomap, pos);
@@ -1199,20 +1203,20 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
}
if (error)
- goto error_unlock_entry;
+ goto error_finish_iomap;
__SetPageUptodate(vmf->cow_page);
vmf_ret = finish_fault(vmf);
if (!vmf_ret)
vmf_ret = VM_FAULT_DONE_COW;
- goto unlock_entry;
+ goto finish_iomap;
}
switch (iomap.type) {
case IOMAP_MAPPED:
if (iomap.flags & IOMAP_F_NEW) {
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
+ count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
major = VM_FAULT_MAJOR;
}
error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
@@ -1225,7 +1229,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
vmf_ret = dax_load_hole(mapping, &entry, vmf);
- goto unlock_entry;
+ goto finish_iomap;
}
/*FALLTHRU*/
default:
@@ -1234,10 +1238,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
break;
}
- error_unlock_entry:
+ error_finish_iomap:
vmf_ret = dax_fault_return(error) | major;
- unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PAGE_SIZE;
@@ -1252,6 +1254,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
+ unlock_entry:
+ put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
+ trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
@@ -1397,6 +1403,28 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
goto fallback;
/*
+ * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+ * PMD or a HZP entry. If it can't (because a 4k page is already in
+ * the tree, for instance), it will return -EEXIST and we just fall
+ * back to 4k entries.
+ */
+ entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+ if (IS_ERR(entry))
+ goto fallback;
+
+ /*
+ * It is possible, particularly with mixed reads & writes to private
+ * mappings, that we have raced with a PTE fault that overlaps with
+ * the PMD we need to set up. If so just return and the fault will be
+ * retried.
+ */
+ if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
+ !pmd_devmap(*vmf->pmd)) {
+ result = 0;
+ goto unlock_entry;
+ }
+
+ /*
* Note that we don't use iomap_apply here. We aren't doing I/O, only
* setting up a mapping, so really we're using iomap_begin() as a way
* to look up our filesystem block.
@@ -1404,21 +1432,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
pos = (loff_t)pgoff << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
if (error)
- goto fallback;
+ goto unlock_entry;
if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap;
- /*
- * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
- * PMD or a HZP entry. If it can't (because a 4k page is already in
- * the tree, for instance), it will return -EEXIST and we just fall
- * back to 4k entries.
- */
- entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
- if (IS_ERR(entry))
- goto finish_iomap;
-
switch (iomap.type) {
case IOMAP_MAPPED:
result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
@@ -1426,7 +1444,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
- goto unlock_entry;
+ break;
result = dax_pmd_load_hole(vmf, &iomap, &entry);
break;
default:
@@ -1434,8 +1452,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
break;
}
- unlock_entry:
- put_locked_mapping_entry(mapping, pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PMD_SIZE;
@@ -1451,6 +1467,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
&iomap);
}
+ unlock_entry:
+ put_locked_mapping_entry(mapping, pgoff, entry);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --git a/fs/dcache.c b/fs/dcache.c
index 95d71eda8142..7ece68d0d4db 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -277,6 +277,33 @@ static inline int dname_external(const struct dentry *dentry)
return dentry->d_name.name != dentry->d_iname;
}
+void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ if (unlikely(dname_external(dentry))) {
+ struct external_name *p = external_name(dentry);
+ atomic_inc(&p->u.count);
+ spin_unlock(&dentry->d_lock);
+ name->name = p->name;
+ } else {
+ memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN);
+ spin_unlock(&dentry->d_lock);
+ name->name = name->inline_name;
+ }
+}
+EXPORT_SYMBOL(take_dentry_name_snapshot);
+
+void release_dentry_name_snapshot(struct name_snapshot *name)
+{
+ if (unlikely(name->name != name->inline_name)) {
+ struct external_name *p;
+ p = container_of(name->name, struct external_name, name[0]);
+ if (unlikely(atomic_dec_and_test(&p->u.count)))
+ kfree_rcu(p, u.head);
+ }
+}
+EXPORT_SYMBOL(release_dentry_name_snapshot);
+
static inline void __d_set_inode_and_type(struct dentry *dentry,
struct inode *inode,
unsigned type_flags)
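
take_dentry_name_snapshot() stabilizes a name across operations that may free it, with two cases: an external (long) name is pinned by bumping its refcount, while an inline (short) name is copied by value; the release side only has a reference to drop in the external case. A simplified sketch of the copy-or-pin idea, without the dentry locking:

    #include <stddef.h>
    #include <stdlib.h>
    #include <string.h>

    #define INLINE_LEN 32

    struct ext_name { int refs; char str[]; };  /* heap-allocated name */
    struct snapshot { const char *name; char inline_name[INLINE_LEN]; };

    static void snap_take(struct snapshot *s, struct ext_name *ext,
                          const char *short_name)
    {
        if (ext) {                      /* long name: pin by reference */
            ext->refs++;
            s->name = ext->str;
        } else {                        /* short name: copy by value */
            strncpy(s->inline_name, short_name, INLINE_LEN - 1);
            s->inline_name[INLINE_LEN - 1] = '\0';
            s->name = s->inline_name;
        }
    }

    static void snap_release(struct snapshot *s)
    {
        if (s->name != s->inline_name) {        /* external case only */
            struct ext_name *ext = (struct ext_name *)
                ((char *)s->name - offsetof(struct ext_name, str));
            if (--ext->refs == 0)
                free(ext);
        }
    }

debugfs_rename() below is converted to exactly this pairing: snapshot before simple_rename(), pass old_name.name to fsnotify_move(), release afterwards.
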
@@ -419,6 +446,8 @@ static void dentry_lru_add(struct dentry *dentry)
{
if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
d_lru_add(dentry);
+ else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
+ dentry->d_flags |= DCACHE_REFERENCED;
}
/**
@@ -779,8 +808,6 @@ repeat:
goto kill_it;
}
- if (!(dentry->d_flags & DCACHE_REFERENCED))
- dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
dentry->d_lockref.count--;
@@ -1494,7 +1521,7 @@ static void check_and_drop(void *_data)
{
struct detach_data *data = _data;
- if (!data->mountpoint && !data->select.found)
+ if (!data->mountpoint && list_empty(&data->select.dispose))
__d_drop(data->select.start);
}
@@ -1536,17 +1563,15 @@ void d_invalidate(struct dentry *dentry)
d_walk(dentry, &data, detach_and_collect, check_and_drop);
- if (data.select.found)
+ if (!list_empty(&data.select.dispose))
shrink_dentry_list(&data.select.dispose);
+ else if (!data.mountpoint)
+ return;
if (data.mountpoint) {
detach_mounts(data.mountpoint);
dput(data.mountpoint);
}
-
- if (!data.mountpoint && !data.select.found)
- break;
-
cond_resched();
}
}
@@ -3548,8 +3573,6 @@ __setup("dhash_entries=", set_dhash_entries);
static void __init dcache_init_early(void)
{
- unsigned int loop;
-
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
*/
@@ -3561,24 +3584,19 @@ static void __init dcache_init_early(void)
sizeof(struct hlist_bl_head),
dhash_entries,
13,
- HASH_EARLY,
+ HASH_EARLY | HASH_ZERO,
&d_hash_shift,
&d_hash_mask,
0,
0);
-
- for (loop = 0; loop < (1U << d_hash_shift); loop++)
- INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
}
static void __init dcache_init(void)
{
- unsigned int loop;
-
- /*
+ /*
* A constructor could be added for stable state like the lists,
* but it is probably not worth it because of the cache nature
- * of the dcache.
+ * of the dcache.
*/
dentry_cache = KMEM_CACHE(dentry,
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
@@ -3592,14 +3610,11 @@ static void __init dcache_init(void)
sizeof(struct hlist_bl_head),
dhash_entries,
13,
- 0,
+ HASH_ZERO,
&d_hash_shift,
&d_hash_mask,
0,
0);
-
- for (loop = 0; loop < (1U << d_hash_shift); loop++)
- INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
}
/* SLAB cache for __getname() consumers */
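
Passing HASH_ZERO lets alloc_large_system_hash() hand back zeroed memory instead of each caller looping over the buckets, which works because an empty hlist_bl head is represented by a NULL first pointer (the lock bit lives in the pointer's low bits). A tiny demonstration that zero filling equals explicit empty initialization, with a simplified head type:

    #include <assert.h>
    #include <string.h>

    struct head { unsigned long first; }; /* simplified: pointer + lock bit */

    int main(void)
    {
        struct head a = { 0 };          /* explicit empty bucket */
        struct head b;

        memset(&b, 0, sizeof(b));       /* what HASH_ZERO provides */
        assert(memcmp(&a, &b, sizeof(a)) == 0);
        return 0;
    }
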
@@ -3610,6 +3625,11 @@ EXPORT_SYMBOL(d_genocide);
void __init vfs_caches_init_early(void)
{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
+ INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
+
dcache_init_early();
inode_init_early();
}
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 354e2ab62031..6dabc4a10396 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -9,7 +9,7 @@
* 2 as published by the Free Software Foundation.
*
* debugfs is for people to use instead of /proc or /sys.
- * See Documentation/DocBook/filesystems for more details.
+ * See Documentation/filesystems/ for more details.
*
*/
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7fd4ec4bb214..a0e4e2f7e0be 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -9,7 +9,7 @@
* 2 as published by the Free Software Foundation.
*
* debugfs is for people to use instead of /proc or /sys.
- * See Documentation/DocBook/kernel-api for more details.
+ * See ./Documentation/core-api/kernel-api.rst for more details.
*
*/
@@ -199,7 +199,7 @@ static const struct dentry_operations debugfs_dops = {
static int debug_fill_super(struct super_block *sb, void *data, int silent)
{
- static struct tree_descr debug_files[] = {{""}};
+ static const struct tree_descr debug_files[] = {{""}};
struct debugfs_fs_info *fsi;
int err;
@@ -766,7 +766,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
{
int error;
struct dentry *dentry = NULL, *trap;
- const char *old_name;
+ struct name_snapshot old_name;
trap = lock_rename(new_dir, old_dir);
/* Source or destination directories don't exist? */
@@ -781,19 +781,19 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry))
goto exit;
- old_name = fsnotify_oldname_init(old_dentry->d_name.name);
+ take_dentry_name_snapshot(&old_name, old_dentry);
error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir),
dentry, 0);
if (error) {
- fsnotify_oldname_free(old_name);
+ release_dentry_name_snapshot(&old_name);
goto exit;
}
d_move(old_dentry, dentry);
- fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name,
+ fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name.name,
d_is_dir(old_dentry),
NULL, old_dentry);
- fsnotify_oldname_free(old_name);
+ release_dentry_name_snapshot(&old_name);
unlock_rename(new_dir, old_dir);
dput(dentry);
return old_dentry;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index a04ebea77de8..08cf27811e5a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -294,7 +294,7 @@ static void dio_aio_complete_work(struct work_struct *work)
dio_complete(dio, 0, true);
}
-static int dio_bio_complete(struct dio *dio, struct bio *bio);
+static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
/*
* Asynchronous IO callback.
@@ -348,13 +348,12 @@ static void dio_bio_end_io(struct bio *bio)
/**
* dio_end_io - handle the end io action for the given bio
 * @bio: The direct io bio that's being completed
- * @error: Error if there was one
*
* This is meant to be called by any filesystem that uses their own dio_submit_t
* so that the DIO specific endio actions are dealt with after the filesystem
 * has done its completion work.
*/
-void dio_end_io(struct bio *bio, int error)
+void dio_end_io(struct bio *bio)
{
struct dio *dio = bio->bi_private;
@@ -386,6 +385,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
else
bio->bi_end_io = dio_bio_end_io;
+ bio->bi_write_hint = dio->iocb->ki_hint;
+
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
@@ -474,17 +475,20 @@ static struct bio *dio_await_one(struct dio *dio)
/*
* Process one completed BIO. No locks are held.
*/
-static int dio_bio_complete(struct dio *dio, struct bio *bio)
+static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
{
struct bio_vec *bvec;
unsigned i;
- int err;
+ blk_status_t err = bio->bi_status;
- if (bio->bi_error)
- dio->io_error = -EIO;
+ if (err) {
+ if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT))
+ dio->io_error = -EAGAIN;
+ else
+ dio->io_error = -EIO;
+ }
if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
- err = bio->bi_error;
bio_check_pages_dirty(bio); /* transfers ownership */
} else {
bio_for_each_segment_all(bvec, bio, i) {
@@ -495,7 +499,6 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
set_page_dirty_lock(page);
put_page(page);
}
- err = bio->bi_error;
bio_put(bio);
}
return err;
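
dio_bio_complete() now returns the bio's blk_status_t, and the mapping to an errno happens in one place: BLK_STS_AGAIN on a REQ_NOWAIT bio becomes -EAGAIN so a nonblocking writer can retry, while every other failure collapses to -EIO. A hedged sketch of that mapping with illustrative constant values:

    #include <errno.h>

    enum blk_status { BLK_STS_OK = 0, BLK_STS_AGAIN = 12 }; /* illustrative */
    #define REQ_NOWAIT (1u << 21)                           /* illustrative */

    static int dio_status_to_errno(enum blk_status st, unsigned int opf)
    {
        if (st == BLK_STS_OK)
            return 0;
        if (st == BLK_STS_AGAIN && (opf & REQ_NOWAIT))
            return -EAGAIN;     /* nonblocking I/O: caller may retry */
        return -EIO;            /* everything else is a hard error */
    }
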
@@ -539,7 +542,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
bio = dio->bio_list;
dio->bio_list = bio->bi_private;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- ret2 = dio_bio_complete(dio, bio);
+ ret2 = blk_status_to_errno(dio_bio_complete(dio, bio));
if (ret == 0)
ret = ret2;
}
@@ -1197,6 +1200,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
if (iov_iter_rw(iter) == WRITE) {
dio->op = REQ_OP_WRITE;
dio->op_flags = REQ_SYNC | REQ_IDLE;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ dio->op_flags |= REQ_NOWAIT;
} else {
dio->op = REQ_OP_READ;
}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 68b9fffcb2c8..2fb4eadaa118 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -191,7 +191,7 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
* This is used to atomically remove a wait queue entry from the eventfd wait
* queue head, and read/reset the counter value.
*/
-int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
__u64 *cnt)
{
unsigned long flags;
@@ -215,8 +215,8 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
*
* Returns %0 if successful, or the following error codes:
*
- * -EAGAIN : The operation would have blocked but @no_wait was non-zero.
- * -ERESTARTSYS : A signal interrupted the wait operation.
+ * - -EAGAIN : The operation would have blocked but @no_wait was non-zero.
+ * - -ERESTARTSYS : A signal interrupted the wait operation.
*
* If @no_wait is zero, the function might sleep until the eventfd internal
* counter becomes greater than zero.
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5420767c9b68..b1c8e23ddf65 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -244,7 +244,7 @@ struct eppoll_entry {
* Wait queue item that will be linked to the target file wait
* queue head.
*/
- wait_queue_t wait;
+ wait_queue_entry_t wait;
/* The wait queue head that linked the "wait" wait queue item */
wait_queue_head_t *whead;
@@ -347,13 +347,13 @@ static inline int ep_is_linked(struct list_head *p)
return !list_empty(p);
}
-static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
+static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
return container_of(p, struct eppoll_entry, wait);
}
/* Get the "struct epitem" from a wait queue pointer */
-static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
+static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
return container_of(p, struct eppoll_entry, wait)->base;
}
@@ -1078,7 +1078,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
* mechanism. It is called by the stored file descriptors when they
* have events to report.
*/
-static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
unsigned long flags;
@@ -1094,7 +1094,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* can't use __remove_wait_queue(). whead->lock is held by
* the caller.
*/
- list_del_init(&wait->task_list);
+ list_del_init(&wait->entry);
}
spin_lock_irqsave(&ep->lock, flags);
@@ -1699,7 +1699,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int res = 0, eavail, timed_out = 0;
unsigned long flags;
u64 slack = 0;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
if (timeout > 0) {
diff --git a/fs/exec.c b/fs/exec.c
index 72934df68471..62175cbcc801 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -220,7 +220,24 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
if (write) {
unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
- struct rlimit *rlim;
+ unsigned long ptr_size, limit;
+
+ /*
+ * Since the stack will hold pointers to the strings, we
+ * must account for them as well.
+ *
+ * The size calculation is the entire vma while each arg page is
+ * built, so each time we get here it's calculating how far it
+ * is currently (rather than each call being just the newly
+ * added size from the arg page). As a result, we need to
+ * always add the entire size of the pointers, so that on the
+ * last call to get_arg_page() we'll actually have the entire
+ * correct size.
+ */
+ ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
+ if (ptr_size > ULONG_MAX - size)
+ goto fail;
+ size += ptr_size;
acct_arg_size(bprm, size / PAGE_SIZE);
@@ -232,20 +249,24 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
return page;
/*
- * Limit to 1/4-th the stack size for the argv+env strings.
+ * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
+ * (whichever is smaller) for the argv+env strings.
* This ensures that:
* - the remaining binfmt code will not run out of stack space,
* - the program will have a reasonable amount of stack left
* to work from.
*/
- rlim = current->signal->rlim;
- if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
- put_page(page);
- return NULL;
- }
+ limit = _STK_LIM / 4 * 3;
+ limit = min(limit, rlimit(RLIMIT_STACK) / 4);
+ if (size > limit)
+ goto fail;
}
return page;
+
+fail:
+ put_page(page);
+ return NULL;
}
static void put_arg_page(struct page *page)
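
The new bound charges the argv/envp pointer array against the same budget as the strings themselves and caps the total at the smaller of 3/4 of _STK_LIM and a quarter of RLIMIT_STACK; with the usual 8 MiB _STK_LIM that ceiling is 6 MiB. A small worked sketch of the computation (the default below is an assumption, not read from a live kernel):

    #include <stdio.h>

    #define STK_LIM (8UL << 20)             /* usual default: 8 MiB */

    static unsigned long argv_env_limit(unsigned long stack_rlimit)
    {
        unsigned long limit = STK_LIM / 4 * 3;  /* 6 MiB ceiling */

        if (stack_rlimit / 4 < limit)
            limit = stack_rlimit / 4;           /* whichever is smaller */
        return limit;
    }

    int main(void)
    {
        printf("rlimit 8 MiB -> %lu bytes\n", argv_env_limit(8UL << 20));
        printf("rlimit huge  -> %lu bytes\n", argv_env_limit(~0UL));
        return 0;
    }

With an 8 MiB stack rlimit the binding term is rlimit/4 = 2 MiB; only very large rlimits hit the 6 MiB ceiling.
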
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 42f9a0a0c4ca..98233a97b7b8 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -72,7 +72,7 @@ static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
set_page_dirty(page);
if (IS_DIRSYNC(dir))
- err = write_one_page(page, 1);
+ err = write_one_page(page);
else
unlock_page(page);
@@ -405,8 +405,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
int err;
lock_page(page);
- err = exofs_write_begin(NULL, page->mapping, pos, len,
- AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+ err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL);
if (err)
EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
err);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index d9650c9508e4..e2709695b177 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -100,7 +100,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
}
if (IS_DIRSYNC(dir)) {
- err = write_one_page(page, 1);
+ err = write_one_page(page);
if (!err)
err = sync_inode_metadata(dir, 1);
} else {
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 03f5ce1d3dbe..23ebb92484c6 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -113,7 +113,7 @@ struct ext2_sb_info {
* of the mount options.
*/
spinlock_t s_lock;
- struct mb_cache *s_mb_cache;
+ struct mb_cache *s_ea_block_cache;
};
static inline spinlock_t *
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index b21891a6bfca..d34d32bdc944 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -174,15 +174,12 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
int ret;
struct super_block *sb = file->f_mapping->host->i_sb;
- struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
ret = generic_file_fsync(file, start, end, datasync);
- if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
+ if (ret == -EIO)
/* We don't really know where the IO error happened... */
ext2_error(sb, __func__,
"detected IO error when writing metadata buffers");
- ret = -EIO;
- }
return ret;
}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 26d77f9f8c12..2dcbd5698884 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -817,7 +817,7 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->bdev = bdev;
iomap->offset = (u64)first_block << blkbits;
if (blk_queue_dax(bdev->bd_queue))
- iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
else
iomap->dax_dev = NULL;
@@ -841,7 +841,7 @@ static int
ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
- put_dax(iomap->dax_dev);
+ fs_put_dax(iomap->dax_dev);
if (iomap->type == IOMAP_MAPPED &&
written < length &&
(flags & IOMAP_WRITE))
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 8ac673c71a36..7b1bc9059863 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -32,6 +32,7 @@
#include <linux/log2.h>
#include <linux/quotaops.h>
#include <linux/uaccess.h>
+#include <linux/dax.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
@@ -146,9 +147,9 @@ static void ext2_put_super (struct super_block * sb)
ext2_quota_off_umount(sb);
- if (sbi->s_mb_cache) {
- ext2_xattr_destroy_cache(sbi->s_mb_cache);
- sbi->s_mb_cache = NULL;
+ if (sbi->s_ea_block_cache) {
+ ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
+ sbi->s_ea_block_cache = NULL;
}
if (!(sb->s_flags & MS_RDONLY)) {
struct ext2_super_block *es = sbi->s_es;
@@ -1130,9 +1131,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
#ifdef CONFIG_EXT2_FS_XATTR
- sbi->s_mb_cache = ext2_xattr_create_cache();
- if (!sbi->s_mb_cache) {
- ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+ sbi->s_ea_block_cache = ext2_xattr_create_cache();
+ if (!sbi->s_ea_block_cache) {
+ ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache");
goto failed_mount3;
}
#endif
@@ -1181,8 +1182,8 @@ cantfind_ext2:
sb->s_id);
goto failed_mount;
failed_mount3:
- if (sbi->s_mb_cache)
- ext2_xattr_destroy_cache(sbi->s_mb_cache);
+ if (sbi->s_ea_block_cache)
+ ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index fbdb8f171893..1b9b1268d418 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -121,6 +121,8 @@ const struct xattr_handler *ext2_xattr_handlers[] = {
NULL
};
+#define EA_BLOCK_CACHE(inode) (EXT2_SB(inode->i_sb)->s_ea_block_cache)
+
static inline const struct xattr_handler *
ext2_xattr_handler(int name_index)
{
@@ -150,7 +152,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name,
size_t name_len, size;
char *end;
int error;
- struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
name_index, name, buffer, (long)buffer_size);
@@ -195,7 +197,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
goto found;
entry = next;
}
- if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
+ if (ext2_xattr_cache_insert(ea_block_cache, bh))
ea_idebug(inode, "cache insert failed");
error = -ENODATA;
goto cleanup;
@@ -208,7 +210,7 @@ found:
le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
goto bad_block;
- if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
+ if (ext2_xattr_cache_insert(ea_block_cache, bh))
ea_idebug(inode, "cache insert failed");
if (buffer) {
error = -ERANGE;
@@ -246,7 +248,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
char *end;
size_t rest = buffer_size;
int error;
- struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
@@ -281,7 +283,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
goto bad_block;
entry = next;
}
- if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
+ if (ext2_xattr_cache_insert(ea_block_cache, bh))
ea_idebug(inode, "cache insert failed");
/* list the attribute names */
@@ -493,8 +495,8 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
* This must happen under buffer lock for
* ext2_xattr_set2() to reliably detect modified block
*/
- mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
- hash, bh->b_blocknr);
+ mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash,
+ bh->b_blocknr);
/* keep the buffer locked while modifying it. */
} else {
@@ -627,7 +629,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
struct super_block *sb = inode->i_sb;
struct buffer_head *new_bh = NULL;
int error;
- struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache;
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
if (header) {
new_bh = ext2_xattr_cache_find(inode, header);
@@ -655,7 +657,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
don't need to change the reference count. */
new_bh = old_bh;
get_bh(new_bh);
- ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
+ ext2_xattr_cache_insert(ea_block_cache, new_bh);
} else {
/* We need to allocate a new block */
ext2_fsblk_t goal = ext2_group_first_block_no(sb,
@@ -676,7 +678,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
memcpy(new_bh->b_data, header, new_bh->b_size);
set_buffer_uptodate(new_bh);
unlock_buffer(new_bh);
- ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
+ ext2_xattr_cache_insert(ea_block_cache, new_bh);
ext2_xattr_update_super_block(sb);
}
@@ -721,8 +723,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
* This must happen under buffer lock for
* ext2_xattr_set2() to reliably detect freed block
*/
- mb_cache_entry_delete_block(ext2_mb_cache,
- hash, old_bh->b_blocknr);
+ mb_cache_entry_delete(ea_block_cache, hash,
+ old_bh->b_blocknr);
/* Free the old block. */
ea_bdebug(old_bh, "freeing");
ext2_free_blocks(inode, old_bh->b_blocknr, 1);
@@ -795,8 +797,8 @@ ext2_xattr_delete_inode(struct inode *inode)
* This must happen under buffer lock for ext2_xattr_set2() to
* reliably detect freed block
*/
- mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
- hash, bh->b_blocknr);
+ mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash,
+ bh->b_blocknr);
ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
get_bh(bh);
bforget(bh);
@@ -897,21 +899,21 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
{
__u32 hash = le32_to_cpu(header->h_hash);
struct mb_cache_entry *ce;
- struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext2_mb_cache, hash);
+ ce = mb_cache_entry_find_first(ea_block_cache, hash);
while (ce) {
struct buffer_head *bh;
- bh = sb_bread(inode->i_sb, ce->e_block);
+ bh = sb_bread(inode->i_sb, ce->e_value);
if (!bh) {
ext2_error(inode->i_sb, "ext2_xattr_cache_find",
"inode %ld: block %ld read error",
- inode->i_ino, (unsigned long) ce->e_block);
+ inode->i_ino, (unsigned long) ce->e_value);
} else {
lock_buffer(bh);
/*
@@ -924,27 +926,27 @@ again:
* entry is still hashed is reliable.
*/
if (hlist_bl_unhashed(&ce->e_hash_list)) {
- mb_cache_entry_put(ext2_mb_cache, ce);
+ mb_cache_entry_put(ea_block_cache, ce);
unlock_buffer(bh);
brelse(bh);
goto again;
} else if (le32_to_cpu(HDR(bh)->h_refcount) >
EXT2_XATTR_REFCOUNT_MAX) {
ea_idebug(inode, "block %ld refcount %d>%d",
- (unsigned long) ce->e_block,
+ (unsigned long) ce->e_value,
le32_to_cpu(HDR(bh)->h_refcount),
EXT2_XATTR_REFCOUNT_MAX);
} else if (!ext2_xattr_cmp(header, HDR(bh))) {
ea_bdebug(bh, "b_count=%d",
atomic_read(&(bh->b_count)));
- mb_cache_entry_touch(ext2_mb_cache, ce);
- mb_cache_entry_put(ext2_mb_cache, ce);
+ mb_cache_entry_touch(ea_block_cache, ce);
+ mb_cache_entry_put(ea_block_cache, ce);
return bh;
}
unlock_buffer(bh);
brelse(bh);
}
- ce = mb_cache_entry_find_next(ext2_mb_cache, ce);
+ ce = mb_cache_entry_find_next(ea_block_cache, ce);
}
return NULL;
}
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index fd389935ecd1..09441ae07a5b 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -4,6 +4,7 @@
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
*/
+#include <linux/quotaops.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
@@ -182,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type)
*/
static int
__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
- struct posix_acl *acl)
+ struct posix_acl *acl, int xattr_flags)
{
int name_index;
void *value = NULL;
@@ -217,7 +218,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
}
error = ext4_xattr_set_handle(handle, inode, name_index, "",
- value, size, 0);
+ value, size, xattr_flags);
kfree(value);
if (!error)
@@ -230,15 +231,23 @@ int
ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
handle_t *handle;
- int error, retries = 0;
+ int error, credits, retries = 0;
+ size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0;
+ error = dquot_initialize(inode);
+ if (error)
+ return error;
retry:
- handle = ext4_journal_start(inode, EXT4_HT_XATTR,
- ext4_jbd2_credits_xattr(inode));
+ error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */,
+ &credits);
+ if (error)
+ return error;
+
+ handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
- error = __ext4_set_acl(handle, inode, type, acl);
+ error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */);
ext4_journal_stop(handle);
if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -263,13 +272,13 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
if (default_acl) {
error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
- default_acl);
+ default_acl, XATTR_CREATE);
posix_acl_release(default_acl);
}
if (acl) {
if (!error)
error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
- acl);
+ acl, XATTR_CREATE);
posix_acl_release(acl);
}
return error;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8e8046104f4d..9ebde0cd632e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1114,6 +1114,7 @@ struct ext4_inode_info {
/*
* Mount flags set via mount options or defaults
*/
+#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
@@ -1444,6 +1445,8 @@ struct ext4_sb_info {
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
unsigned int s_mb_free_pending;
+ struct list_head s_freed_data_list; /* List of blocks to be freed
+ after the commit has completed */
/* tunables */
unsigned long s_stripe;
@@ -1516,7 +1519,8 @@ struct ext4_sb_info {
struct list_head s_es_list; /* List of inodes with reclaimable extents */
long s_es_nr_inode;
struct ext4_es_stats s_es_stats;
- struct mb_cache *s_mb_cache;
+ struct mb_cache *s_ea_block_cache;
+ struct mb_cache *s_ea_inode_cache;
spinlock_t s_es_lock ____cacheline_aligned_in_smp;
/* Ratelimit ext4 messages. */
@@ -1797,10 +1801,12 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
EXT4_FEATURE_INCOMPAT_EXTENTS| \
EXT4_FEATURE_INCOMPAT_64BIT| \
EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+ EXT4_FEATURE_INCOMPAT_EA_INODE| \
EXT4_FEATURE_INCOMPAT_MMP | \
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
EXT4_FEATURE_INCOMPAT_ENCRYPT | \
- EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
+ EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -2098,6 +2104,12 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
}
+static inline bool ext4_is_quota_file(struct inode *inode)
+{
+ return IS_NOQUOTA(inode) &&
+ !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
+}
+
/*
* This structure is stuffed into the struct file's private_data field
* for directories. It is where we put information so that we can do
@@ -2126,6 +2138,16 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
*/
#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
+/* htree levels for ext4 */
+#define EXT4_HTREE_LEVEL_COMPAT 2
+#define EXT4_HTREE_LEVEL 3
+
+static inline int ext4_dir_htree_level(struct super_block *sb)
+{
+ return ext4_has_feature_largedir(sb) ?
+ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+}
+
/*
* Timeout and state flag for lazy initialization inode thread.
*/
@@ -2389,16 +2411,17 @@ extern int ext4fs_dirhash(const char *name, int len, struct
/* ialloc.c */
extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
const struct qstr *qstr, __u32 goal,
- uid_t *owner, int handle_type,
- unsigned int line_no, int nblocks);
+ uid_t *owner, __u32 i_flags,
+ int handle_type, unsigned int line_no,
+ int nblocks);
-#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \
+#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \
__ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \
- 0, 0, 0)
+ i_flags, 0, 0, 0)
#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
type, nblocks) \
__ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
- (type), __LINE__, (nblocks))
+ 0, (type), __LINE__, (nblocks))
extern void ext4_free_inode(handle_t *, struct inode *);
@@ -2433,6 +2456,7 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
/* inode.c */
int ext4_inode_is_fast_symlink(struct inode *inode);
@@ -2523,7 +2547,6 @@ extern int ext4_search_dir(struct buffer_head *bh,
int buf_size,
struct inode *dir,
struct ext4_filename *fname,
- const struct qstr *d_name,
unsigned int offset,
struct ext4_dir_entry_2 **res_dir);
extern int ext4_generic_delete_entry(handle_t *handle,
@@ -2705,19 +2728,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
extern int ext4_register_li_request(struct super_block *sb,
ext4_group_t first_not_zeroed);
-static inline int ext4_has_group_desc_csum(struct super_block *sb)
-{
- return ext4_has_feature_gdt_csum(sb) ||
- EXT4_SB(sb)->s_chksum_driver != NULL;
-}
-
static inline int ext4_has_metadata_csum(struct super_block *sb)
{
WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
!EXT4_SB(sb)->s_chksum_driver);
- return (EXT4_SB(sb)->s_chksum_driver != NULL);
+ return ext4_has_feature_metadata_csum(sb) &&
+ (EXT4_SB(sb)->s_chksum_driver != NULL);
}
+
+static inline int ext4_has_group_desc_csum(struct super_block *sb)
+{
+ return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
+}
+
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
@@ -2757,13 +2781,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
}
-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
+static inline loff_t ext4_isize(struct super_block *sb,
+ struct ext4_inode *raw_inode)
{
- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+ if (ext4_has_feature_largedir(sb) ||
+ S_ISREG(le16_to_cpu(raw_inode->i_mode)))
return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
le32_to_cpu(raw_inode->i_size_lo);
- else
- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+
+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
}
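The same high/low assembly in isolation; with largedir the high half now counts for directories too (standalone sketch with assumed on-disk values):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t i_size_lo = 0x00001000;   /* assumed on-disk halves */
	uint32_t i_size_high = 0x00000002;

	/* 8 GiB + 4 KiB: impossible for a directory before largedir,
	 * legal with it */
	uint64_t size = ((uint64_t)i_size_high << 32) | i_size_lo;
	printf("size: %llu bytes\n", (unsigned long long)size);
	return 0;
}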
static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
@@ -3007,7 +3033,6 @@ extern int htree_inlinedir_to_tree(struct file *dir_file,
int *has_inline_data);
extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
struct ext4_filename *fname,
- const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
int *has_inline_data);
extern int ext4_delete_inline_entry(handle_t *handle,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index f97611171023..dabad1bc8617 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -77,7 +77,14 @@
#define EXT4_RESERVE_TRANS_BLOCKS 12U
-#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8
+/*
+ * Number of credits needed if we need to insert an entry into a
+ * directory. For each new index block, we need 4 blocks (old index
+ * block, new index block, bitmap block, bg summary). For normal
+ * htree directories there are 2 levels; if the largedir feature
+ * enabled it's 3 levels.
+ */
+#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U
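The arithmetic behind the new constant, spelled out as a standalone sketch (the 4-blocks-per-level breakdown is the one given in the comment above):

#include <stdio.h>

int main(void)
{
	int blocks_per_level = 4; /* old index, new index, bitmap, bg summary */
	int max_levels = 3;       /* largedir htree depth */

	printf("EXT4_INDEX_EXTRA_TRANS_BLOCKS = %d\n",
	       blocks_per_level * max_levels); /* 12 */
	return 0;
}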
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
@@ -104,20 +111,6 @@
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
-static inline int ext4_jbd2_credits_xattr(struct inode *inode)
-{
- int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
-
- /*
- * In case of inline data, we may push out the data to a block,
- * so we need to reserve credits for this eventuality
- */
- if (ext4_has_inline_data(inode))
- credits += ext4_writepage_trans_blocks(inode) + 1;
- return credits;
-}
-
-
/*
* Ext4 handle operation types -- for logging purposes
*/
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2a97dff87b96..e0a8425ff74d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2488,7 +2488,8 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
static inline int get_default_free_blocks_flags(struct inode *inode)
{
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
else if (ext4_should_journal_data(inode))
return EXT4_FREE_BLOCKS_FORGET;
@@ -3413,13 +3414,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct ext4_sb_info *sbi;
struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
- struct ext4_extent zero_ex;
+ struct ext4_extent zero_ex1, zero_ex2;
struct ext4_extent *ex, *abut_ex;
ext4_lblk_t ee_block, eof_block;
unsigned int ee_len, depth, map_len = map->m_len;
int allocated = 0, max_zeroout = 0;
int err = 0;
- int split_flag = 0;
+ int split_flag = EXT4_EXT_DATA_VALID2;
ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
"block %llu, max_blocks %u\n", inode->i_ino,
@@ -3436,7 +3437,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- zero_ex.ee_len = 0;
+ zero_ex1.ee_len = 0;
+ zero_ex2.ee_len = 0;
trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
@@ -3576,62 +3578,52 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
if (ext4_encrypted_inode(inode))
max_zeroout = 0;
- /* If extent is less than s_max_zeroout_kb, zeroout directly */
- if (max_zeroout && (ee_len <= max_zeroout)) {
- err = ext4_ext_zeroout(inode, ex);
- if (err)
- goto out;
- zero_ex.ee_block = ex->ee_block;
- zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
- ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
-
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto out;
- ext4_ext_mark_initialized(ex);
- ext4_ext_try_to_merge(handle, inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + path->p_depth);
- goto out;
- }
-
/*
- * four cases:
+ * five cases:
* 1. split the extent into three extents.
- * 2. split the extent into two extents, zeroout the first half.
- * 3. split the extent into two extents, zeroout the second half.
+ * 2. split the extent into two extents, zeroout the head of the first
+ * extent.
+ * 3. split the extent into two extents, zeroout the tail of the second
+ * extent.
* 4. split the extent into two extents without zeroout.
+ * 5. no splitting needed, just possibly zeroout the head and/or the
+ * tail of the extent.
*/
split_map.m_lblk = map->m_lblk;
split_map.m_len = map->m_len;
- if (max_zeroout && (allocated > map->m_len)) {
+ if (max_zeroout && (allocated > split_map.m_len)) {
if (allocated <= max_zeroout) {
- /* case 3 */
- zero_ex.ee_block =
- cpu_to_le32(map->m_lblk);
- zero_ex.ee_len = cpu_to_le16(allocated);
- ext4_ext_store_pblock(&zero_ex,
- ext4_ext_pblock(ex) + map->m_lblk - ee_block);
- err = ext4_ext_zeroout(inode, &zero_ex);
+ /* case 3 or 5 */
+ zero_ex1.ee_block =
+ cpu_to_le32(split_map.m_lblk +
+ split_map.m_len);
+ zero_ex1.ee_len =
+ cpu_to_le16(allocated - split_map.m_len);
+ ext4_ext_store_pblock(&zero_ex1,
+ ext4_ext_pblock(ex) + split_map.m_lblk +
+ split_map.m_len - ee_block);
+ err = ext4_ext_zeroout(inode, &zero_ex1);
if (err)
goto out;
- split_map.m_lblk = map->m_lblk;
split_map.m_len = allocated;
- } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
- /* case 2 */
- if (map->m_lblk != ee_block) {
- zero_ex.ee_block = ex->ee_block;
- zero_ex.ee_len = cpu_to_le16(map->m_lblk -
+ }
+ if (split_map.m_lblk - ee_block + split_map.m_len <
+ max_zeroout) {
+ /* case 2 or 5 */
+ if (split_map.m_lblk != ee_block) {
+ zero_ex2.ee_block = ex->ee_block;
+ zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
ee_block);
- ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_store_pblock(&zero_ex2,
ext4_ext_pblock(ex));
- err = ext4_ext_zeroout(inode, &zero_ex);
+ err = ext4_ext_zeroout(inode, &zero_ex2);
if (err)
goto out;
}
+ split_map.m_len += split_map.m_lblk - ee_block;
split_map.m_lblk = ee_block;
- split_map.m_len = map->m_lblk - ee_block + map->m_len;
allocated = map->m_len;
}
}
@@ -3642,8 +3634,11 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
err = 0;
out:
/* If we have gotten a failure, don't zero out status tree */
- if (!err)
- err = ext4_zeroout_es(inode, &zero_ex);
+ if (!err) {
+ err = ext4_zeroout_es(inode, &zero_ex1);
+ if (!err)
+ err = ext4_zeroout_es(inode, &zero_ex2);
+ }
return err ? err : allocated;
}
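The geometry the two zeroout extents cover, modeled standalone: given an unwritten extent and a written sub-range inside it, zero_ex2 covers the head before the write and zero_ex1 the tail after it (assumed block numbers; not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned ee_block = 100, ee_len = 64; /* assumed unwritten extent */
	unsigned m_lblk = 120, m_len = 8;     /* assumed write inside it */

	unsigned head = m_lblk - ee_block;                    /* zero_ex2 */
	unsigned tail = ee_block + ee_len - (m_lblk + m_len); /* zero_ex1 */

	printf("zero head [%u,%u) = %u blocks\n", ee_block, m_lblk, head);
	printf("zero tail [%u,%u) = %u blocks\n",
	       m_lblk + m_len, ee_block + ee_len, tail);
	return 0;
}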
@@ -4883,6 +4878,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
/* Zero out partial block at the edges of the range */
ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+ if (ret >= 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (file->f_flags & O_SYNC)
ext4_handle_sync(handle);
@@ -5569,6 +5566,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ext4_handle_sync(handle);
inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
ext4_journal_stop(handle);
@@ -5742,6 +5740,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
+ if (ret >= 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
ext4_journal_stop(handle);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index cefa9835f275..58294c9a7e1d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -37,7 +37,11 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
- inode_lock_shared(inode);
+ if (!inode_trylock_shared(inode)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_lock_shared(inode);
+ }
/*
* Recheck under inode lock - at this point we are sure it cannot
* change anymore
@@ -179,7 +183,11 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
- inode_lock(inode);
+ if (!inode_trylock(inode)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_lock(inode);
+ }
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
@@ -216,7 +224,12 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
return ext4_dax_write_iter(iocb, from);
#endif
- inode_lock(inode);
+ if (!inode_trylock(inode)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_lock(inode);
+ }
+
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
@@ -235,9 +248,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
iocb->private = &overwrite;
/* Check whether we do a DIO overwrite or not */
- if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio &&
- ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)))
- overwrite = 1;
+ if (o_direct && !unaligned_aio) {
+ if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
+ if (ext4_should_dioread_nolock(inode))
+ overwrite = 1;
+ } else if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
ret = __generic_file_write_iter(iocb, from);
inode_unlock(inode);
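The trylock-or-EAGAIN shape added in the three hunks above, modeled with a pthread mutex (sketch only; the kernel path uses inode_trylock()/inode_lock(), not pthreads):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

/* build: cc -pthread */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int write_path(bool nowait)
{
	if (pthread_mutex_trylock(&lock) != 0) {
		if (nowait)
			return -EAGAIN;        /* caller asked not to block */
		pthread_mutex_lock(&lock);     /* otherwise sleep as before */
	}
	/* ... perform the write under the lock ... */
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void) { return write_path(true) ? 1 : 0; }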
@@ -257,6 +276,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size)
{
int result;
+ handle_t *handle = NULL;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -264,12 +284,24 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
if (write) {
sb_start_pagefault(sb);
file_update_time(vmf->vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ EXT4_DATA_TRANS_BLOCKS(sb));
+ } else {
+ down_read(&EXT4_I(inode)->i_mmap_sem);
}
- down_read(&EXT4_I(inode)->i_mmap_sem);
- result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
- up_read(&EXT4_I(inode)->i_mmap_sem);
- if (write)
+ if (!IS_ERR(handle))
+ result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
+ else
+ result = VM_FAULT_SIGBUS;
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
+ } else {
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ }
return result;
}
@@ -332,13 +364,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- if (ext4_encrypted_inode(inode)) {
- int err = fscrypt_get_encryption_info(inode);
- if (err)
- return 0;
- if (!fscrypt_has_encryption_key(inode))
- return -ENOKEY;
- }
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
@@ -422,6 +447,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
if (ret < 0)
return ret;
}
+
+ /* Set the flags to support nowait AIO */
+ filp->f_mode |= FMODE_AIO_NOWAIT;
+
return dquot_file_open(inode, filp);
}
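From userspace the new flag surfaces as RWF_NOWAIT; a hedged example using pwritev2() (the synchronous variant needs Linux 4.14+ and glibc 2.26+; the aio path this patch targets uses io_submit() with the same -EAGAIN semantics):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096] __attribute__((aligned(4096)));
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);

	if (fd < 0)
		return 1;
	memset(buf, 0, sizeof(buf));
	/* fails with EAGAIN instead of blocking, e.g. on block allocation */
	if (pwritev2(fd, &iov, 1, 0, RWF_NOWAIT) < 0 && errno == EAGAIN)
		fprintf(stderr, "would block; resubmit from an aio context\n");
	close(fd);
	return 0;
}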
@@ -461,57 +490,37 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
endoff = (loff_t)end_blk << blkbits;
index = startoff >> PAGE_SHIFT;
- end = endoff >> PAGE_SHIFT;
+ end = (endoff - 1) >> PAGE_SHIFT;
pagevec_init(&pvec, 0);
do {
int i, num;
unsigned long nr_pages;
- num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+ num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
(pgoff_t)num);
- if (nr_pages == 0) {
- if (whence == SEEK_DATA)
- break;
-
- BUG_ON(whence != SEEK_HOLE);
- /*
- * If this is the first time to go into the loop and
- * offset is not beyond the end offset, it will be a
- * hole at this offset
- */
- if (lastoff == startoff || lastoff < endoff)
- found = 1;
+ if (nr_pages == 0)
break;
- }
-
- /*
- * If this is the first time to go into the loop and
- * offset is smaller than the first page offset, it will be a
- * hole at this offset.
- */
- if (lastoff == startoff && whence == SEEK_HOLE &&
- lastoff < page_offset(pvec.pages[0])) {
- found = 1;
- break;
- }
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
struct buffer_head *bh, *head;
/*
- * If the current offset is not beyond the end of given
- * range, it will be a hole.
+ * If the current offset is smaller than the page offset,
+ * there is a hole at this offset.
*/
- if (lastoff < endoff && whence == SEEK_HOLE &&
- page->index > end) {
+ if (whence == SEEK_HOLE && lastoff < endoff &&
+ lastoff < page_offset(pvec.pages[i])) {
found = 1;
*offset = lastoff;
goto out;
}
+ if (page->index > end)
+ goto out;
+
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping)) {
@@ -551,20 +560,18 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
unlock_page(page);
}
- /*
- * The no. of pages is less than our desired, that would be a
- * hole in there.
- */
- if (nr_pages < num && whence == SEEK_HOLE) {
- found = 1;
- *offset = lastoff;
+ /* Got fewer pages than requested: no more pages to scan, done. */
+ if (nr_pages < num)
break;
- }
index = pvec.pages[i - 1]->index + 1;
pagevec_release(&pvec);
} while (index <= end);
+ if (whence == SEEK_HOLE && lastoff < endoff) {
+ found = 1;
+ *offset = lastoff;
+ }
out:
pagevec_release(&pvec);
return found;
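The behaviour the rewritten loop must preserve, exercised from userspace (sketch; the reported hole offset is filesystem- and blocksize-dependent, 4 KiB assumed here):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("sparse", O_RDWR | O_CREAT | O_TRUNC, 0644);
	off_t hole;

	if (fd < 0)
		return 1;
	if (write(fd, "data", 4) != 4)   /* first page has data */
		return 1;
	if (ftruncate(fd, 1 << 20))      /* the rest is a hole */
		return 1;
	hole = lseek(fd, 0, SEEK_HOLE);
	printf("first hole at %lld\n", (long long)hole); /* expect 4096 */
	close(fd);
	return 0;
}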
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index b19436098837..7ec340898598 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -480,6 +480,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start_fsb;
ext4_fsblk_t end_fsb;
+ ext4_fsblk_t bofs;
ext4_fsblk_t eofs;
ext4_group_t start_ag;
ext4_group_t end_ag;
@@ -487,9 +488,12 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
ext4_grpblk_t last_cluster;
int error = 0;
+ bofs = le32_to_cpu(sbi->s_es->s_first_data_block);
eofs = ext4_blocks_count(sbi->s_es);
if (keys[0].fmr_physical >= eofs)
return 0;
+ else if (keys[0].fmr_physical < bofs)
+ keys[0].fmr_physical = bofs;
if (keys[1].fmr_physical >= eofs)
keys[1].fmr_physical = eofs - 1;
start_fsb = keys[0].fmr_physical;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 9d549608fd30..aae2c3971cef 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -124,7 +124,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ ret = file_write_and_wait_range(file, start, end);
if (ret)
return ret;
/*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 98ac2f1f23b3..507bfb3344d4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -294,7 +294,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
* as writing the quota to disk may need the lock as well.
*/
dquot_initialize(inode);
- ext4_xattr_delete_inode(handle, inode);
dquot_free_inode(inode);
dquot_drop(inode);
@@ -743,8 +742,9 @@ out:
*/
struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
umode_t mode, const struct qstr *qstr,
- __u32 goal, uid_t *owner, int handle_type,
- unsigned int line_no, int nblocks)
+ __u32 goal, uid_t *owner, __u32 i_flags,
+ int handle_type, unsigned int line_no,
+ int nblocks)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
@@ -766,30 +766,69 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (!dir || !dir->i_nlink)
return ERR_PTR(-EPERM);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+ sb = dir->i_sb;
+ sbi = EXT4_SB(sb);
+
+ if (unlikely(ext4_forced_shutdown(sbi)))
return ERR_PTR(-EIO);
- if ((ext4_encrypted_inode(dir) ||
- DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
+ if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
+ !(i_flags & EXT4_EA_INODE_FL)) {
err = fscrypt_get_encryption_info(dir);
if (err)
return ERR_PTR(err);
if (!fscrypt_has_encryption_key(dir))
return ERR_PTR(-ENOKEY);
- if (!handle)
- nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb);
encrypt = 1;
}
- sb = dir->i_sb;
+ if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+
+ if (p) {
+ int acl_size = p->a_count * sizeof(ext4_acl_entry);
+
+ nblocks += (S_ISDIR(mode) ? 2 : 1) *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, acl_size,
+ true /* is_create */);
+ posix_acl_release(p);
+ }
+#endif
+
+#ifdef CONFIG_SECURITY
+ {
+ int num_security_xattrs = 1;
+
+#ifdef CONFIG_INTEGRITY
+ num_security_xattrs++;
+#endif
+ /*
+ * We assume that security xattrs are never
+ * more than 1k. In practice they are under
+ * 128 bytes.
+ */
+ nblocks += num_security_xattrs *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, 1024,
+ true /* is_create */);
+ }
+#endif
+ if (encrypt)
+ nblocks += __ext4_xattr_set_credits(sb,
+ NULL /* inode */, NULL /* block_bh */,
+ FSCRYPT_SET_CONTEXT_MAX_SIZE,
+ true /* is_create */);
+ }
+
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
ei = EXT4_I(inode);
- sbi = EXT4_SB(sb);
/*
* Initialize owners and quota early so that we don't have to account
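The shape of the credit pre-reservation above, reduced to standalone arithmetic (the flat 3-credit helper is an assumption standing in for __ext4_xattr_set_credits(); real credit counts vary with the filesystem geometry):

#include <stdio.h>

static int xattr_set_credits(int value_len)
{
	(void)value_len; /* assumed flat cost: xattr block, bitmap, gd */
	return 3;
}

int main(void)
{
	int nblocks = 0, is_dir = 1;
	int acl_size = 4 * 8;                   /* assumed 4 ACL entries */

	nblocks += (is_dir ? 2 : 1) * xattr_set_credits(acl_size);
	nblocks += 2 * xattr_set_credits(1024); /* security + integrity */
	printf("extra credits reserved up front: %d\n", nblocks);
	return 0;
}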
@@ -1053,6 +1092,7 @@ got:
/* Don't inherit extent flag from directory, amongst others. */
ei->i_flags =
ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
+ ei->i_flags |= i_flags;
ei->i_file_acl = 0;
ei->i_dtime = 0;
ei->i_block_group = group;
@@ -1109,13 +1149,15 @@ got:
goto fail_free_drop;
}
- err = ext4_init_acl(handle, inode, dir);
- if (err)
- goto fail_free_drop;
+ if (!(ei->i_flags & EXT4_EA_INODE_FL)) {
+ err = ext4_init_acl(handle, inode, dir);
+ if (err)
+ goto fail_free_drop;
- err = ext4_init_security(handle, inode, dir, qstr);
- if (err)
- goto fail_free_drop;
+ err = ext4_init_security(handle, inode, dir, qstr);
+ if (err)
+ goto fail_free_drop;
+ }
if (ext4_has_feature_extents(sb)) {
/* set extent flag only for directory, file and normal symlink*/
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index bc15c2c17633..7ffa290cbb8e 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -829,7 +829,8 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
int flags = EXT4_FREE_BLOCKS_VALIDATED;
int err;
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
else if (ext4_should_journal_data(inode))
flags |= EXT4_FREE_BLOCKS_FORGET;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index d5dea4c293ef..28c5c3abddb3 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -61,7 +61,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
/* Compute min_offs. */
for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
- if (!entry->e_value_block && entry->e_value_size) {
+ if (!entry->e_value_inum && entry->e_value_size) {
size_t offs = le16_to_cpu(entry->e_value_offs);
if (offs < min_offs)
min_offs = offs;
@@ -1627,7 +1627,6 @@ out:
struct buffer_head *ext4_find_inline_entry(struct inode *dir,
struct ext4_filename *fname,
- const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
int *has_inline_data)
{
@@ -1649,7 +1648,7 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
EXT4_INLINE_DOTDOT_SIZE;
inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
- dir, fname, d_name, 0, res_dir);
+ dir, fname, 0, res_dir);
if (ret == 1)
goto out_find;
if (ret < 0)
@@ -1662,7 +1661,7 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
- dir, fname, d_name, 0, res_dir);
+ dir, fname, 0, res_dir);
if (ret == 1)
goto out_find;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5834c4d76be8..3c600f02673f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -144,16 +144,12 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
/*
* Test whether an inode is a fast symlink.
+ * A fast symlink has its symlink data stored in ext4_inode_info->i_data.
*/
int ext4_inode_is_fast_symlink(struct inode *inode)
{
- int ea_blocks = EXT4_I(inode)->i_file_acl ?
- EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
-
- if (ext4_has_inline_data(inode))
- return 0;
-
- return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
+ return S_ISLNK(inode->i_mode) && inode->i_size &&
+ (inode->i_size < EXT4_N_BLOCKS * 4);
}
/*
@@ -189,6 +185,8 @@ void ext4_evict_inode(struct inode *inode)
{
handle_t *handle;
int err;
+ int extra_credits = 3;
+ struct ext4_xattr_inode_array *ea_inode_array = NULL;
trace_ext4_evict_inode(inode);
@@ -213,7 +211,8 @@ void ext4_evict_inode(struct inode *inode)
*/
if (inode->i_ino != EXT4_JOURNAL_INO &&
ext4_should_journal_data(inode) &&
- (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+ (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+ inode->i_data.nrpages) {
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
@@ -238,8 +237,12 @@ void ext4_evict_inode(struct inode *inode)
* protection against it
*/
sb_start_intwrite(inode->i_sb);
+
+ if (!IS_NOQUOTA(inode))
+ extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
+
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
- ext4_blocks_for_truncate(inode)+3);
+ ext4_blocks_for_truncate(inode)+extra_credits);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
@@ -254,6 +257,16 @@ void ext4_evict_inode(struct inode *inode)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
+
+ /*
+ * Set inode->i_size to 0 before calling ext4_truncate(). We need
+ * special handling of symlinks here because i_size is used to
+ * determine whether ext4_inode_info->i_data contains symlink data or
+ * block mappings. Setting i_size to 0 will remove its fast symlink
+ * status. Erase i_data so that it becomes a valid empty block map.
+ */
+ if (ext4_inode_is_fast_symlink(inode))
+ memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
inode->i_size = 0;
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
@@ -271,25 +284,17 @@ void ext4_evict_inode(struct inode *inode)
}
}
- /*
- * ext4_ext_truncate() doesn't reserve any slop when it
- * restarts journal transactions; therefore there may not be
- * enough credits left in the handle to remove the inode from
- * the orphan list and set the dtime field.
- */
- if (!ext4_handle_has_enough_credits(handle, 3)) {
- err = ext4_journal_extend(handle, 3);
- if (err > 0)
- err = ext4_journal_restart(handle, 3);
- if (err != 0) {
- ext4_warning(inode->i_sb,
- "couldn't extend journal (err %d)", err);
- stop_handle:
- ext4_journal_stop(handle);
- ext4_orphan_del(NULL, inode);
- sb_end_intwrite(inode->i_sb);
- goto no_delete;
- }
+ /* Remove xattr references. */
+ err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
+ extra_credits);
+ if (err) {
+ ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
+stop_handle:
+ ext4_journal_stop(handle);
+ ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
+ ext4_xattr_inode_array_free(ea_inode_array);
+ goto no_delete;
}
/*
@@ -317,6 +322,7 @@ void ext4_evict_inode(struct inode *inode)
ext4_free_inode(handle, inode);
ext4_journal_stop(handle);
sb_end_intwrite(inode->i_sb);
+ ext4_xattr_inode_array_free(ea_inode_array);
return;
no_delete:
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
@@ -710,7 +716,7 @@ out_sem:
if (map->m_flags & EXT4_MAP_NEW &&
!(map->m_flags & EXT4_MAP_UNWRITTEN) &&
!(flags & EXT4_GET_BLOCKS_ZERO) &&
- !IS_NOQUOTA(inode) &&
+ !ext4_is_quota_file(inode) &&
ext4_should_order_data(inode)) {
if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
ret = ext4_jbd2_inode_add_wait(handle, inode);
@@ -2124,15 +2130,29 @@ static int ext4_writepage(struct page *page,
static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
{
int len;
- loff_t size = i_size_read(mpd->inode);
+ loff_t size;
int err;
BUG_ON(page->index != mpd->first_page);
+ clear_page_dirty_for_io(page);
+ /*
+ * We have to be very careful here! Nothing protects writeback path
+ * against i_size changes and the page can be writeably mapped into
+ * page tables. So an application can be growing i_size and writing
+ * data through mmap while writeback runs. clear_page_dirty_for_io()
+ * write-protects our page in page tables and the page cannot get
+ * written to again until we release page lock. So only after
+ * clear_page_dirty_for_io() we are safe to sample i_size for
+ * ext4_bio_write_page() to zero-out tail of the written page. We rely
+ * on the barrier provided by TestClearPageDirty in
+ * clear_page_dirty_for_io() to make sure i_size is really sampled only
+ * after page tables are updated.
+ */
+ size = i_size_read(mpd->inode);
if (page->index == size >> PAGE_SHIFT)
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
- clear_page_dirty_for_io(page);
err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
if (!err)
mpd->wbc->nr_to_write--;
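The tail-length computation that now runs only after the page has been write-protected, in isolation (4 KiB pages assumed):

#include <stdio.h>

int main(void)
{
	unsigned long long size = 10000; /* i_size sampled after protection */
	unsigned long page_index = 2, page_size = 4096, len;

	if (page_index == size / page_size)
		len = size % page_size;  /* EOF page: 10000 - 8192 = 1808 */
	else
		len = page_size;
	printf("write %lu bytes of page %lu\n", len, page_index);
	return 0;
}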
@@ -3412,7 +3432,7 @@ retry:
bdev = inode->i_sb->s_bdev;
iomap->bdev = bdev;
if (blk_queue_dax(bdev->bd_queue))
- iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
else
iomap->dax_dev = NULL;
iomap->offset = first_block << blkbits;
@@ -3447,7 +3467,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
int blkbits = inode->i_blkbits;
bool truncate = false;
- put_dax(iomap->dax_dev);
+ fs_put_dax(iomap->dax_dev);
if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
return 0;
@@ -3629,9 +3649,6 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
get_block_func = ext4_dio_get_block_unwritten_async;
dio_flags = DIO_LOCKING;
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
-#endif
ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
get_block_func, ext4_end_io_dio, NULL,
dio_flags);
@@ -3713,7 +3730,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
*/
inode_lock_shared(inode);
ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
- iocb->ki_pos + count);
+ iocb->ki_pos + count - 1);
if (ret)
goto out_unlock;
ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
@@ -4207,6 +4224,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
+ if (ret >= 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
ext4_journal_stop(handle);
out_dio:
@@ -4699,7 +4718,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (ext4_has_feature_64bit(sb))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
- inode->i_size = ext4_isize(raw_inode);
+ inode->i_size = ext4_isize(sb, raw_inode);
if ((size = i_size_read(inode)) < 0) {
EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
@@ -4833,6 +4852,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
}
brelse(iloc.bh);
ext4_set_inode_flags(inode);
+
+ if (ei->i_flags & EXT4_EA_INODE_FL) {
+ ext4_xattr_inode_set_class(inode);
+
+ inode_lock(inode);
+ inode->i_flags |= S_NOQUOTA;
+ inode_unlock(inode);
+ }
+
unlock_new_inode(inode);
return inode;
@@ -5024,7 +5052,7 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- if (ei->i_disksize != ext4_isize(raw_inode)) {
+ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
ext4_isize_set(raw_inode, ei->i_disksize);
need_datasync = 1;
}
@@ -5274,7 +5302,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
error = PTR_ERR(handle);
goto err_out;
}
+
+ /* dquot_transfer() calls back into ext4_get_inode_usage(), which
+ * counts xattr inode references.
+ */
+ down_read(&EXT4_I(inode)->xattr_sem);
error = dquot_transfer(inode, attr);
+ up_read(&EXT4_I(inode)->xattr_sem);
+
if (error) {
ext4_journal_stop(handle);
return error;
@@ -5294,6 +5329,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
loff_t oldsize = inode->i_size;
int shrink = (attr->ia_size <= inode->i_size);
+ if (ext4_encrypted_inode(inode)) {
+ error = fscrypt_get_encryption_info(inode);
+ if (error)
+ return error;
+ if (!fscrypt_has_encryption_key(inode))
+ return -ENOKEY;
+ }
+
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -5637,8 +5680,9 @@ static int ext4_expand_extra_isize(struct inode *inode,
/* No extended attributes present */
if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
- memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
- new_extra_isize);
+ memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
+ EXT4_I(inode)->i_extra_isize, 0,
+ new_extra_isize - EXT4_I(inode)->i_extra_isize);
EXT4_I(inode)->i_extra_isize = new_extra_isize;
return 0;
}
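The corrected clearing range in isolation: only the bytes between the old and the new i_extra_isize are zeroed, leaving the already-initialized prefix alone (standalone sketch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char extra[256];
	unsigned old_isize = 32, new_isize = 64;

	memset(extra, 'X', sizeof(extra)); /* pretend: live data */
	memset(extra + old_isize, 0, new_isize - old_isize);
	printf("cleared [%u,%u), kept [0,%u)\n",
	       old_isize, new_isize, old_isize);
	return 0;
}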
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0c21e22acd74..42b3a73143cf 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -218,7 +218,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
unsigned int jflag;
/* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode))
+ if (ext4_is_quota_file(inode))
goto flags_out;
oldflags = ei->i_flags;
@@ -342,7 +342,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
err = -EPERM;
inode_lock(inode);
/* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode))
+ if (ext4_is_quota_file(inode))
goto out_unlock;
err = ext4_get_inode_loc(inode, &iloc);
@@ -373,7 +373,13 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
if (!IS_ERR(transfer_to[PRJQUOTA])) {
+
+ /* __dquot_transfer() calls back into ext4_get_inode_usage(), which
+ * counts xattr inode references.
+ */
+ down_read(&EXT4_I(inode)->xattr_sem);
err = __dquot_transfer(inode, transfer_to);
+ up_read(&EXT4_I(inode)->xattr_sem);
dqput(transfer_to[PRJQUOTA]);
if (err)
goto out_dirty;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 36de58a37653..581e357e8406 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -367,8 +367,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void ext4_free_data_callback(struct super_block *sb,
- struct ext4_journal_cb_entry *jce, int rc);
static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -2393,7 +2391,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
return 0;
size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
- new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
+ new_groupinfo = kvzalloc(size, GFP_KERNEL);
if (!new_groupinfo) {
ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
return -ENOMEM;
@@ -2639,6 +2637,7 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&sbi->s_md_lock);
spin_lock_init(&sbi->s_bal_lock);
sbi->s_mb_free_pending = 0;
+ INIT_LIST_HEAD(&sbi->s_freed_data_list);
sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
@@ -2782,7 +2781,8 @@ int ext4_mb_release(struct super_block *sb)
}
static inline int ext4_issue_discard(struct super_block *sb,
- ext4_group_t block_group, ext4_grpblk_t cluster, int count)
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count,
+ struct bio **biop)
{
ext4_fsblk_t discard_block;
@@ -2791,18 +2791,18 @@ static inline int ext4_issue_discard(struct super_block *sb,
count = EXT4_C2B(EXT4_SB(sb), count);
trace_ext4_discard_blocks(sb,
(unsigned long long) discard_block, count);
- return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+ if (biop) {
+ return __blkdev_issue_discard(sb->s_bdev,
+ (sector_t)discard_block << (sb->s_blocksize_bits - 9),
+ (sector_t)count << (sb->s_blocksize_bits - 9),
+ GFP_NOFS, 0, biop);
+ } else
+ return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}
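The unit conversion used when handing the discard to the block layer, standalone (blocksize_bits - 9 turns filesystem blocks into 512-byte sectors; 4 KiB blocks assumed):

#include <stdio.h>

int main(void)
{
	unsigned blocksize_bits = 12; /* 4 KiB blocks */
	unsigned long long block = 1000, count = 16;

	unsigned long long sector = block << (blocksize_bits - 9);
	unsigned long long nr = count << (blocksize_bits - 9);
	printf("discard sectors [%llu,%llu)\n", sector, sector + nr);
	return 0;
}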
-/*
- * This function is called by the jbd2 layer once the commit has finished,
- * so we know we can free the blocks that were released with that commit.
- */
-static void ext4_free_data_callback(struct super_block *sb,
- struct ext4_journal_cb_entry *jce,
- int rc)
+static void ext4_free_data_in_buddy(struct super_block *sb,
+ struct ext4_free_data *entry)
{
- struct ext4_free_data *entry = (struct ext4_free_data *)jce;
struct ext4_buddy e4b;
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
@@ -2810,18 +2810,6 @@ static void ext4_free_data_callback(struct super_block *sb,
mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
entry->efd_count, entry->efd_group, entry);
- if (test_opt(sb, DISCARD)) {
- err = ext4_issue_discard(sb, entry->efd_group,
- entry->efd_start_cluster,
- entry->efd_count);
- if (err && err != -EOPNOTSUPP)
- ext4_msg(sb, KERN_WARNING, "discard request in"
- " group:%d block:%d count:%d failed"
- " with %d", entry->efd_group,
- entry->efd_start_cluster,
- entry->efd_count, err);
- }
-
err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
@@ -2862,6 +2850,56 @@ static void ext4_free_data_callback(struct super_block *sb,
mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
}
+/*
+ * This function is called by the jbd2 layer once the commit has finished,
+ * so we know we can free the blocks that were released with that commit.
+ */
+void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_free_data *entry, *tmp;
+ struct bio *discard_bio = NULL;
+ struct list_head freed_data_list;
+ struct list_head *cut_pos = NULL;
+ int err;
+
+ INIT_LIST_HEAD(&freed_data_list);
+
+ spin_lock(&sbi->s_md_lock);
+ list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
+ if (entry->efd_tid != commit_tid)
+ break;
+ cut_pos = &entry->efd_list;
+ }
+ if (cut_pos)
+ list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
+ cut_pos);
+ spin_unlock(&sbi->s_md_lock);
+
+ if (test_opt(sb, DISCARD)) {
+ list_for_each_entry(entry, &freed_data_list, efd_list) {
+ err = ext4_issue_discard(sb, entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count,
+ &discard_bio);
+ if (err && err != -EOPNOTSUPP) {
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%d failed"
+ " with %d", entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count, err);
+ } else if (err == -EOPNOTSUPP)
+ break;
+ }
+
+ if (discard_bio)
+ submit_bio_wait(discard_bio);
+ }
+
+ list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
+ ext4_free_data_in_buddy(sb, entry);
+}
+
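The per-commit cut in miniature: the freed list is ordered by transaction id, so everything up to the last entry carrying the committing tid can be spliced off in one pass (an array stands in for the list_head machinery; sketch only):

#include <stdio.h>

struct freed { unsigned tid, group; };

int main(void)
{
	struct freed list[] = { {7, 1}, {7, 3}, {8, 2}, {9, 5} };
	unsigned commit_tid = 7;
	size_t i, cut = 0;

	for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) {
		if (list[i].tid != commit_tid)
			break;
		cut = i + 1; /* safe to process everything before here */
	}
	for (i = 0; i < cut; i++)
		printf("free extent in group %u (tid %u)\n",
		       list[i].group, list[i].tid);
	return 0;
}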
int __init ext4_init_mballoc(void)
{
ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
@@ -3529,7 +3567,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_set_bits(bitmap, start, len);
preallocated += len;
}
- mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
+ mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
}
static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3887,7 +3925,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
- ext4_error(sb, "Error loading buddy information for %u", group);
+ ext4_warning(sb, "Error %d loading buddy information for %u",
+ err, group);
put_bh(bitmap_bh);
return 0;
}
@@ -4044,10 +4083,11 @@ repeat:
BUG_ON(pa->pa_type != MB_INODE_PA);
group = ext4_get_group_number(sb, pa->pa_pstart);
- err = ext4_mb_load_buddy(sb, group, &e4b);
+ err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
+ GFP_NOFS|__GFP_NOFAIL);
if (err) {
- ext4_error(sb, "Error loading buddy information for %u",
- group);
+ ext4_error(sb, "Error %d loading buddy information for %u",
+ err, group);
continue;
}
@@ -4303,11 +4343,14 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
spin_unlock(&lg->lg_prealloc_lock);
list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
+ int err;
group = ext4_get_group_number(sb, pa->pa_pstart);
- if (ext4_mb_load_buddy(sb, group, &e4b)) {
- ext4_error(sb, "Error loading buddy information for %u",
- group);
+ err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
+ GFP_NOFS|__GFP_NOFAIL);
+ if (err) {
+ ext4_error(sb, "Error %d loading buddy information for %u",
+ err, group);
continue;
}
ext4_lock_group(sb, group);
@@ -4459,7 +4502,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
trace_ext4_request_blocks(ar);
/* Allow to use superuser reservation for quota file */
- if (IS_NOQUOTA(ar->inode))
+ if (ext4_is_quota_file(ar->inode))
ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
@@ -4578,14 +4621,28 @@ out:
* are contiguous, AND the extents were freed by the same transaction,
* AND the blocks are associated with the same group.
*/
-static int can_merge(struct ext4_free_data *entry1,
- struct ext4_free_data *entry2)
+static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
+ struct ext4_free_data *entry,
+ struct ext4_free_data *new_entry,
+ struct rb_root *entry_rb_root)
{
- if ((entry1->efd_tid == entry2->efd_tid) &&
- (entry1->efd_group == entry2->efd_group) &&
- ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
- return 1;
- return 0;
+ if ((entry->efd_tid != new_entry->efd_tid) ||
+ (entry->efd_group != new_entry->efd_group))
+ return;
+ if (entry->efd_start_cluster + entry->efd_count ==
+ new_entry->efd_start_cluster) {
+ new_entry->efd_start_cluster = entry->efd_start_cluster;
+ new_entry->efd_count += entry->efd_count;
+ } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
+ entry->efd_start_cluster) {
+ new_entry->efd_count += entry->efd_count;
+ } else
+ return;
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->efd_list);
+ spin_unlock(&sbi->s_md_lock);
+ rb_erase(&entry->efd_node, entry_rb_root);
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
static noinline_for_stack int
@@ -4641,29 +4698,19 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
node = rb_prev(new_node);
if (node) {
entry = rb_entry(node, struct ext4_free_data, efd_node);
- if (can_merge(entry, new_entry) &&
- ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
- new_entry->efd_start_cluster = entry->efd_start_cluster;
- new_entry->efd_count += entry->efd_count;
- rb_erase(node, &(db->bb_free_root));
- kmem_cache_free(ext4_free_data_cachep, entry);
- }
+ ext4_try_merge_freed_extent(sbi, entry, new_entry,
+ &(db->bb_free_root));
}
node = rb_next(new_node);
if (node) {
entry = rb_entry(node, struct ext4_free_data, efd_node);
- if (can_merge(new_entry, entry) &&
- ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
- new_entry->efd_count += entry->efd_count;
- rb_erase(node, &(db->bb_free_root));
- kmem_cache_free(ext4_free_data_cachep, entry);
- }
+ ext4_try_merge_freed_extent(sbi, entry, new_entry,
+ &(db->bb_free_root));
}
- /* Add the extent to transaction's private list */
- new_entry->efd_jce.jce_func = ext4_free_data_callback;
+
spin_lock(&sbi->s_md_lock);
- _ext4_journal_callback_add(handle, &new_entry->efd_jce);
+ list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
sbi->s_mb_free_pending += clusters;
spin_unlock(&sbi->s_md_lock);
return 0;
@@ -4866,7 +4913,8 @@ do_more:
* them with group lock_held
*/
if (test_opt(sb, DISCARD)) {
- err = ext4_issue_discard(sb, block_group, bit, count);
+ err = ext4_issue_discard(sb, block_group, bit, count,
+ NULL);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%d block:%d count:%lu failed"
@@ -5089,7 +5137,7 @@ __acquires(bitlock)
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
- ret = ext4_issue_discard(sb, group, start, count);
+ ret = ext4_issue_discard(sb, group, start, count, NULL);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
return ret;
@@ -5127,8 +5175,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ret = ext4_mb_load_buddy(sb, group, &e4b);
if (ret) {
- ext4_error(sb, "Error in loading buddy "
- "information for %u", group);
+ ext4_warning(sb, "Error %d loading buddy information for %u",
+ ret, group);
return ret;
}
bitmap = e4b.bd_bitmap;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 2bed62084a8c..009300ee1561 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -78,10 +78,8 @@ do { \
struct ext4_free_data {
- /* MUST be the first member */
- struct ext4_journal_cb_entry efd_jce;
-
- /* ext4_free_data private data starts from here */
+ /* this links the free block information from sb_info */
+ struct list_head efd_list;
/* this links the free block information from group_info */
struct rb_node efd_node;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 364ea4d4a943..cf5181b62df1 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
owner[0] = i_uid_read(inode);
owner[1] = i_gid_read(inode);
tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root),
- S_IFREG, NULL, goal, owner);
+ S_IFREG, NULL, goal, owner, 0);
if (IS_ERR(tmp_inode)) {
retval = PTR_ERR(tmp_inode);
ext4_journal_stop(handle);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c992ef2c2f94..9bb36909ec92 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -484,7 +484,7 @@ mext_check_arguments(struct inode *orig_inode,
return -EBUSY;
}
- if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) {
+ if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) {
ext4_debug("ext4 move extent: The argument files should "
"not be quota files [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b81f7d46f344..13f0cadb1238 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -513,7 +513,7 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
{
- return le32_to_cpu(entry->block) & 0x00ffffff;
+ return le32_to_cpu(entry->block) & 0x0fffffff;
}
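What the widened mask buys, standalone: four more bits of logical block number per dx_entry, with the topmost bits still held back (4 KiB blocks assumed for the byte figures):

#include <stdio.h>

int main(void)
{
	unsigned long long old_blocks = 1ULL << 24; /* 0x00ffffff + 1 */
	unsigned long long new_blocks = 1ULL << 28; /* 0x0fffffff + 1 */

	printf("old limit: %llu blocks (%llu GiB)\n",
	       old_blocks, old_blocks * 4096 >> 30);
	printf("new limit: %llu blocks (%llu GiB)\n",
	       new_blocks, new_blocks * 4096 >> 30);
	return 0;
}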
static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
@@ -739,6 +739,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash;
+ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
frame->bh = ext4_read_dirblock(dir, 0, INDEX);
if (IS_ERR(frame->bh))
return (struct dx_frame *) frame->bh;
@@ -768,9 +769,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
}
indirect = root->info.indirect_levels;
- if (indirect > 1) {
- ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
- root->info.indirect_levels);
+ if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
+ ext4_warning(dir->i_sb,
+ "Directory (ino: %lu) htree depth %#06x exceed"
+ "supported value", dir->i_ino,
+ ext4_dir_htree_level(dir->i_sb));
+ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
+ ext4_warning(dir->i_sb, "Enable large directory "
+ "feature to access it");
+ }
goto fail;
}
@@ -859,12 +866,19 @@ fail:
static void dx_release(struct dx_frame *frames)
{
+ struct dx_root_info *info;
+ int i;
+
if (frames[0].bh == NULL)
return;
- if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
- brelse(frames[1].bh);
- brelse(frames[0].bh);
+ info = &((struct dx_root *)frames[0].bh->b_data)->info;
+ for (i = 0; i <= info->indirect_levels; i++) {
+ if (frames[i].bh == NULL)
+ break;
+ brelse(frames[i].bh);
+ frames[i].bh = NULL;
+ }
}
/*
@@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
{
struct dx_hash_info hinfo;
struct ext4_dir_entry_2 *de;
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct inode *dir;
ext4_lblk_t block;
int count = 0;
@@ -1155,12 +1169,11 @@ errout:
static inline int search_dirblock(struct buffer_head *bh,
struct inode *dir,
struct ext4_filename *fname,
- const struct qstr *d_name,
unsigned int offset,
struct ext4_dir_entry_2 **res_dir)
{
return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
- fname, d_name, offset, res_dir);
+ fname, offset, res_dir);
}
/*
@@ -1262,7 +1275,6 @@ static inline bool ext4_match(const struct ext4_filename *fname,
*/
int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
struct inode *dir, struct ext4_filename *fname,
- const struct qstr *d_name,
unsigned int offset, struct ext4_dir_entry_2 **res_dir)
{
struct ext4_dir_entry_2 * de;
@@ -1355,7 +1367,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
if (ext4_has_inline_data(dir)) {
int has_inline_data = 1;
- ret = ext4_find_inline_entry(dir, &fname, d_name, res_dir,
+ ret = ext4_find_inline_entry(dir, &fname, res_dir,
&has_inline_data);
if (has_inline_data) {
if (inlined)
@@ -1430,11 +1442,11 @@ restart:
goto next;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- /* read error, skip block & hope for the best */
EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
(unsigned long) block);
brelse(bh);
- goto next;
+ ret = ERR_PTR(-EIO);
+ goto cleanup_and_exit;
}
if (!buffer_verified(bh) &&
!is_dx_internal_node(dir, block,
@@ -1444,10 +1456,11 @@ restart:
EXT4_ERROR_INODE(dir, "checksumming directory "
"block %lu", (unsigned long)block);
brelse(bh);
- goto next;
+ ret = ERR_PTR(-EFSBADCRC);
+ goto cleanup_and_exit;
}
set_buffer_verified(bh);
- i = search_dirblock(bh, dir, &fname, d_name,
+ i = search_dirblock(bh, dir, &fname,
block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
if (i == 1) {
EXT4_I(dir)->i_dir_start_lookup = block;
@@ -1487,8 +1500,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_dir_entry_2 **res_dir)
{
struct super_block * sb = dir->i_sb;
- struct dx_frame frames[2], *frame;
- const struct qstr *d_name = fname->usr_fname;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct buffer_head *bh;
ext4_lblk_t block;
int retval;
@@ -1505,7 +1517,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
if (IS_ERR(bh))
goto errout;
- retval = search_dirblock(bh, dir, fname, d_name,
+ retval = search_dirblock(bh, dir, fname,
block << EXT4_BLOCK_SIZE_BITS(sb),
res_dir);
if (retval == 1)
@@ -1530,7 +1542,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
bh = NULL;
errout:
- dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
+ dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name));
success:
dx_release(frames);
return bh;
@@ -1892,7 +1904,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
*/
dir->i_mtime = dir->i_ctime = current_time(dir);
ext4_update_dx_flag(dir);
- dir->i_version++;
+ inode_inc_iversion(dir);
ext4_mark_inode_dirty(handle, dir);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_dirent_node(handle, dir, bh);
@@ -1911,7 +1923,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
{
struct buffer_head *bh2;
struct dx_root *root;
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries;
struct ext4_dir_entry_2 *de, *de2;
struct ext4_dir_entry_tail *t;
@@ -2130,13 +2142,16 @@ out:
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
struct inode *dir, struct inode *inode)
{
- struct dx_frame frames[2], *frame;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries, *at;
struct buffer_head *bh;
struct super_block *sb = dir->i_sb;
struct ext4_dir_entry_2 *de;
+ int restart;
int err;
+again:
+ restart = 0;
frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame))
return PTR_ERR(frame);
@@ -2158,24 +2173,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
if (err != -ENOSPC)
goto cleanup;
+ err = 0;
/* Block full, should compress but for now just split */
dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
dx_get_count(entries), dx_get_limit(entries)));
/* Need to split index? */
if (dx_get_count(entries) == dx_get_limit(entries)) {
ext4_lblk_t newblock;
- unsigned icount = dx_get_count(entries);
- int levels = frame - frames;
+ int levels = frame - frames + 1;
+ unsigned int icount;
+ int add_level = 1;
struct dx_entry *entries2;
struct dx_node *node2;
struct buffer_head *bh2;
- if (levels && (dx_get_count(frames->entries) ==
- dx_get_limit(frames->entries))) {
- ext4_warning_inode(dir, "Directory index full!");
+ while (frame > frames) {
+ if (dx_get_count((frame - 1)->entries) <
+ dx_get_limit((frame - 1)->entries)) {
+ add_level = 0;
+ break;
+ }
+ frame--; /* split higher index block */
+ at = frame->at;
+ entries = frame->entries;
+ restart = 1;
+ }
+ if (add_level && levels == ext4_dir_htree_level(sb)) {
+ ext4_warning(sb, "Directory (ino: %lu) index full, "
+ "reach max htree level :%d",
+ dir->i_ino, levels);
+ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
+ ext4_warning(sb, "Large directory feature is "
+ "not enabled on this "
+ "filesystem");
+ }
err = -ENOSPC;
goto cleanup;
}
+ icount = dx_get_count(entries);
bh2 = ext4_append(handle, dir, &newblock);
if (IS_ERR(bh2)) {
err = PTR_ERR(bh2);
@@ -2190,7 +2225,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
err = ext4_journal_get_write_access(handle, frame->bh);
if (err)
goto journal_error;
- if (levels) {
+ if (!add_level) {
unsigned icount1 = icount/2, icount2 = icount - icount1;
unsigned hash2 = dx_get_hash(entries + icount1);
dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
@@ -2198,7 +2233,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
err = ext4_journal_get_write_access(handle,
- frames[0].bh);
+ (frame - 1)->bh);
if (err)
goto journal_error;
@@ -2214,17 +2249,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
frame->entries = entries = entries2;
swap(frame->bh, bh2);
}
- dx_insert_block(frames + 0, hash2, newblock);
- dxtrace(dx_show_index("node", frames[1].entries));
+ dx_insert_block((frame - 1), hash2, newblock);
+ dxtrace(dx_show_index("node", frame->entries));
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
if (err)
goto journal_error;
brelse (bh2);
+ err = ext4_handle_dirty_dx_node(handle, dir,
+ (frame - 1)->bh);
+ if (err)
+ goto journal_error;
+ if (restart) {
+ err = ext4_handle_dirty_dx_node(handle, dir,
+ frame->bh);
+ goto journal_error;
+ }
} else {
- dxtrace(printk(KERN_DEBUG
- "Creating second level index...\n"));
+ struct dx_root *dxroot;
memcpy((char *) entries2, (char *) entries,
icount * sizeof(struct dx_entry));
dx_set_limit(entries2, dx_node_limit(dir));
@@ -2232,22 +2275,18 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
/* Set up root */
dx_set_count(entries, 1);
dx_set_block(entries + 0, newblock);
- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
-
- /* Add new access path frame */
- frame = frames + 1;
- frame->at = at = at - entries + entries2;
- frame->entries = entries = entries2;
- frame->bh = bh2;
- err = ext4_journal_get_write_access(handle,
- frame->bh);
+ dxroot = (struct dx_root *)frames[0].bh->b_data;
+ dxroot->info.indirect_levels += 1;
+ dxtrace(printk(KERN_DEBUG
+ "Creating %d level index...\n",
+ dxroot->info.indirect_levels));
+ err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (err)
goto journal_error;
- }
- err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
- if (err) {
- ext4_std_error(inode->i_sb, err);
- goto cleanup;
+ err = ext4_handle_dirty_dx_node(handle, dir, bh2);
+ brelse(bh2);
+ restart = 1;
+ goto journal_error;
}
}
de = do_split(handle, dir, &bh, frame, &fname->hinfo);
@@ -2259,10 +2298,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
goto cleanup;
journal_error:
- ext4_std_error(dir->i_sb, err);
+ ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
cleanup:
brelse(bh);
dx_release(frames);
+ /* @restart is true means the htree path has changed; we need to
+ * repeat dx_probe() to find a valid htree path.
+ */
+ if (restart && err == 0)
+ goto again;
return err;
}
@@ -2299,7 +2343,7 @@ int ext4_generic_delete_entry(handle_t *handle,
blocksize);
else
de->inode = 0;
- dir->i_version++;
+ inode_inc_iversion(dir);
return 0;
}
i += ext4_rec_len_from_disk(de->rec_len, blocksize);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 1a82138ba739..c2fce4478cca 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -85,7 +85,7 @@ static void ext4_finish_bio(struct bio *bio)
}
#endif
- if (bio->bi_error) {
+ if (bio->bi_status) {
SetPageError(page);
mapping_set_error(page->mapping, -EIO);
}
@@ -104,7 +104,7 @@ static void ext4_finish_bio(struct bio *bio)
continue;
}
clear_buffer_async_write(bh);
- if (bio->bi_error)
+ if (bio->bi_status)
buffer_io_error(bh);
} while ((bh = bh->b_this_page) != head);
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
@@ -303,24 +303,25 @@ static void ext4_end_bio(struct bio *bio)
bdevname(bio->bi_bdev, b),
(long long) bio->bi_iter.bi_sector,
(unsigned) bio_sectors(bio),
- bio->bi_error)) {
+ bio->bi_status)) {
ext4_finish_bio(bio);
bio_put(bio);
return;
}
bio->bi_end_io = NULL;
- if (bio->bi_error) {
+ if (bio->bi_status) {
struct inode *inode = io_end->inode;
ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
"(offset %llu size %ld starting block %llu)",
- bio->bi_error, inode->i_ino,
+ bio->bi_status, inode->i_ino,
(unsigned long long) io_end->offset,
(long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
- mapping_set_error(inode->i_mapping, bio->bi_error);
+ mapping_set_error(inode->i_mapping,
+ blk_status_to_errno(bio->bi_status));
}
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
@@ -349,6 +350,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
if (bio) {
int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
REQ_SYNC : 0;
+ io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint;
bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
submit_bio(io->io_bio);
}
@@ -396,6 +398,7 @@ submit_and_retry:
ret = io_submit_init_bio(io, bh);
if (ret)
return ret;
+ io->io_bio->bi_write_hint = inode->i_write_hint;
}
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index a81b829d56de..40a5497b0f60 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -73,7 +73,7 @@ static void mpage_end_io(struct bio *bio)
int i;
if (ext4_bio_encrypted(bio)) {
- if (bio->bi_error) {
+ if (bio->bi_status) {
fscrypt_release_ctx(bio->bi_private);
} else {
fscrypt_decrypt_bio_pages(bio->bi_private, bio);
@@ -83,7 +83,7 @@ static void mpage_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- if (!bio->bi_error) {
+ if (!bio->bi_status) {
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 96973ee74147..0886fe82e9c4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -37,6 +37,7 @@
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
+#include <linux/dax.h>
#include <linux/cleancache.h>
#include <linux/uaccess.h>
@@ -372,6 +373,9 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
struct ext4_journal_cb_entry *jce;
BUG_ON(txn->t_state == T_FINISHED);
+
+ ext4_process_freed_data(sb, txn->t_tid);
+
spin_lock(&sbi->s_md_lock);
while (!list_empty(&txn->t_private_list)) {
jce = list_entry(txn->t_private_list.next,
@@ -847,14 +851,9 @@ static inline void ext4_quota_off_umount(struct super_block *sb)
{
int type;
- if (ext4_has_feature_quota(sb)) {
- dquot_disable(sb, -1,
- DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- } else {
- /* Use our quota_off function to clear inode flags etc. */
- for (type = 0; type < EXT4_MAXQUOTAS; type++)
- ext4_quota_off(sb, type);
- }
+ /* Use our quota_off function to clear inode flags etc. */
+ for (type = 0; type < EXT4_MAXQUOTAS; type++)
+ ext4_quota_off(sb, type);
}
#else
static inline void ext4_quota_off_umount(struct super_block *sb)
@@ -931,9 +930,13 @@ static void ext4_put_super(struct super_block *sb)
invalidate_bdev(sbi->journal_bdev);
ext4_blkdev_remove(sbi);
}
- if (sbi->s_mb_cache) {
- ext4_xattr_destroy_cache(sbi->s_mb_cache);
- sbi->s_mb_cache = NULL;
+ if (sbi->s_ea_inode_cache) {
+ ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+ sbi->s_ea_inode_cache = NULL;
+ }
+ if (sbi->s_ea_block_cache) {
+ ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
+ sbi->s_ea_block_cache = NULL;
}
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
@@ -1147,7 +1150,16 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
void *fs_data)
{
handle_t *handle = fs_data;
- int res, res2, retries = 0;
+ int res, res2, credits, retries = 0;
+
+ /*
+ * Encrypting the root directory is not allowed because e2fsck expects
+ * lost+found to exist and be unencrypted, and encrypting the root
+ * directory would imply encrypting the lost+found directory as well as
+ * the filename "lost+found" itself.
+ */
+ if (inode->i_ino == EXT4_ROOT_INO)
+ return -EPERM;
res = ext4_convert_inline_data(inode);
if (res)
@@ -1178,9 +1190,16 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
return res;
}
+ res = dquot_initialize(inode);
+ if (res)
+ return res;
retry:
- handle = ext4_journal_start(inode, EXT4_HT_MISC,
- ext4_jbd2_credits_xattr(inode));
+ res = ext4_xattr_set_credits(inode, len, false /* is_create */,
+ &credits);
+ if (res)
+ return res;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1204,7 +1223,7 @@ retry:
return res;
}
-static int ext4_dummy_context(struct inode *inode)
+static bool ext4_dummy_context(struct inode *inode)
{
return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
}
@@ -1257,16 +1276,17 @@ static struct dquot **ext4_get_dquots(struct inode *inode)
}
static const struct dquot_operations ext4_quota_operations = {
- .get_reserved_space = ext4_get_reserved_space,
- .write_dquot = ext4_write_dquot,
- .acquire_dquot = ext4_acquire_dquot,
- .release_dquot = ext4_release_dquot,
- .mark_dirty = ext4_mark_dquot_dirty,
- .write_info = ext4_write_info,
- .alloc_dquot = dquot_alloc,
- .destroy_dquot = dquot_destroy,
- .get_projid = ext4_get_projid,
- .get_next_id = ext4_get_next_id,
+ .get_reserved_space = ext4_get_reserved_space,
+ .write_dquot = ext4_write_dquot,
+ .acquire_dquot = ext4_acquire_dquot,
+ .release_dquot = ext4_release_dquot,
+ .mark_dirty = ext4_mark_dquot_dirty,
+ .write_info = ext4_write_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
+ .get_projid = ext4_get_projid,
+ .get_inode_usage = ext4_get_inode_usage,
+ .get_next_id = ext4_get_next_id,
};
static const struct quotactl_ops ext4_qctl_operations = {
@@ -1329,7 +1349,7 @@ enum {
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
- Opt_max_dir_size_kb, Opt_nojournal_checksum,
+ Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
};
static const match_table_t tokens = {
@@ -1412,6 +1432,8 @@ static const match_table_t tokens = {
{Opt_noinit_itable, "noinit_itable"},
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
+ {Opt_nombcache, "nombcache"},
+ {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1619,6 +1641,7 @@ static const struct mount_opts {
{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
{Opt_test_dummy_encryption, 0, MOPT_GTE0},
+ {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
{Opt_err, 0, 0}
};
@@ -2155,7 +2178,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
return 0;
size = roundup_pow_of_two(size * sizeof(struct flex_groups));
- new_groups = ext4_kvzalloc(size, GFP_KERNEL);
+ new_groups = kvzalloc(size, GFP_KERNEL);
if (!new_groups) {
ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
size / (int) sizeof(struct flex_groups));
@@ -3446,7 +3469,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
/* Load the checksum driver */
- if (ext4_has_feature_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb) ||
+ ext4_has_feature_ea_inode(sb)) {
sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(sbi->s_chksum_driver)) {
ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
@@ -3468,7 +3492,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Precompute checksum seed for all metadata */
if (ext4_has_feature_csum_seed(sb))
sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
- else if (ext4_has_metadata_csum(sb))
+ else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
sizeof(es->s_uuid));
@@ -3598,6 +3622,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"The Hurd can't support 64-bit file systems");
goto failed_mount;
}
+
+ /*
+ * The ea_inode feature uses the l_i_version field, which is not
+ * available in HURD_COMPAT mode.
+ */
+ if (ext4_has_feature_ea_inode(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "ea_inode feature is not supported for Hurd");
+ goto failed_mount;
+ }
}
if (IS_EXT2_SB(sb)) {
@@ -3889,7 +3923,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
}
- sbi->s_group_desc = ext4_kvmalloc(db_count *
+ sbi->s_group_desc = kvmalloc(db_count *
sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
@@ -3951,7 +3985,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_qcop = &ext4_qctl_operations;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
- memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+ memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
@@ -4062,10 +4096,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
no_journal:
- sbi->s_mb_cache = ext4_xattr_create_cache();
- if (!sbi->s_mb_cache) {
- ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
- goto failed_mount_wq;
+ if (!test_opt(sb, NO_MBCACHE)) {
+ sbi->s_ea_block_cache = ext4_xattr_create_cache();
+ if (!sbi->s_ea_block_cache) {
+ ext4_msg(sb, KERN_ERR,
+ "Failed to create ea_block_cache");
+ goto failed_mount_wq;
+ }
+
+ if (ext4_has_feature_ea_inode(sb)) {
+ sbi->s_ea_inode_cache = ext4_xattr_create_cache();
+ if (!sbi->s_ea_inode_cache) {
+ ext4_msg(sb, KERN_ERR,
+ "Failed to create ea_inode_cache");
+ goto failed_mount_wq;
+ }
+ }
}
if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
@@ -4297,9 +4343,13 @@ failed_mount4:
if (EXT4_SB(sb)->rsv_conversion_wq)
destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
failed_mount_wq:
- if (sbi->s_mb_cache) {
- ext4_xattr_destroy_cache(sbi->s_mb_cache);
- sbi->s_mb_cache = NULL;
+ if (sbi->s_ea_inode_cache) {
+ ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+ sbi->s_ea_inode_cache = NULL;
+ }
+ if (sbi->s_ea_block_cache) {
+ ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
+ sbi->s_ea_block_cache = NULL;
}
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
@@ -4958,6 +5008,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
}
+ if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
+ ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+
if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
"dax flag with busy inodes while remounting");
@@ -5484,7 +5540,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
goto out;
err = dquot_quota_off(sb, type);
- if (err)
+ if (err || ext4_has_feature_quota(sb))
goto out_put;
inode_lock(inode);
@@ -5504,6 +5560,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
out_unlock:
inode_unlock(inode);
out_put:
+ lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL);
iput(inode);
return err;
out:
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index d74dc5f81a04..48c7a7d55ed3 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -100,7 +100,7 @@ static ssize_t reserved_clusters_store(struct ext4_attr *a,
int ret;
ret = kstrtoull(skip_spaces(buf), 0, &val);
- if (!ret || val >= clusters)
+ if (ret || val >= clusters)
return -EINVAL;
atomic64_set(&sbi->s_resv_clusters, val);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8fb7ce14e6eb..cff4f41ced61 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -72,12 +72,14 @@
# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
-static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
-static struct buffer_head *ext4_xattr_cache_find(struct inode *,
- struct ext4_xattr_header *,
- struct mb_cache_entry **);
-static void ext4_xattr_rehash(struct ext4_xattr_header *,
- struct ext4_xattr_entry *);
+static void ext4_xattr_block_cache_insert(struct mb_cache *,
+ struct buffer_head *);
+static struct buffer_head *
+ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
+ struct mb_cache_entry **);
+static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value,
+ size_t value_count);
+static void ext4_xattr_rehash(struct ext4_xattr_header *);
static const struct xattr_handler * const ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
@@ -104,8 +106,22 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
NULL
};
-#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \
- inode->i_sb->s_fs_info)->s_mb_cache)
+#define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \
+ inode->i_sb->s_fs_info)->s_ea_block_cache)
+
+#define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \
+ inode->i_sb->s_fs_info)->s_ea_inode_cache)
+
+static int
+ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
+ struct inode *inode);
+
+#ifdef CONFIG_LOCKDEP
+void ext4_xattr_inode_set_class(struct inode *ea_inode)
+{
+ lockdep_set_subclass(&ea_inode->i_rwsem, 1);
+}
+#endif
static __le32 ext4_xattr_block_csum(struct inode *inode,
sector_t block_nr,
@@ -177,9 +193,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
/* Check the values */
while (!IS_LAST_ENTRY(entry)) {
- if (entry->e_value_block != 0)
- return -EFSCORRUPTED;
- if (entry->e_value_size != 0) {
+ if (entry->e_value_size != 0 &&
+ entry->e_value_inum == 0) {
u16 offs = le16_to_cpu(entry->e_value_offs);
u32 size = le32_to_cpu(entry->e_value_size);
void *value;
@@ -269,6 +284,185 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
return cmp ? -ENODATA : 0;
}
+static u32
+ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
+{
+ return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);
+}
+
+static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
+{
+ return ((u64)ea_inode->i_ctime.tv_sec << 32) |
+ ((u32)ea_inode->i_version);
+}
+
+static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
+{
+ ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
+ ea_inode->i_version = (u32)ref_count;
+}
+
+static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
+{
+ return (u32)ea_inode->i_atime.tv_sec;
+}
+
+static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
+{
+ ea_inode->i_atime.tv_sec = hash;
+}
+
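/*
 * Aside: the four helpers above overload otherwise-unused fields of the
 * EA inode: the 64-bit reference count is split across i_ctime.tv_sec
 * (high word) and i_version (low word), and the value checksum lives in
 * i_atime.tv_sec.  A standalone user-space round-trip of that packing
 * (illustration only, not kernel code):
 */
#include <stdio.h>
#include <stdint.h>

struct ea_inode_demo {
	uint32_t ctime_sec;	/* plays the role of i_ctime.tv_sec */
	uint32_t version;	/* plays the role of i_version */
};

static void set_ref(struct ea_inode_demo *i, uint64_t ref)
{
	i->ctime_sec = (uint32_t)(ref >> 32);	/* high 32 bits */
	i->version = (uint32_t)ref;		/* low 32 bits */
}

static uint64_t get_ref(const struct ea_inode_demo *i)
{
	return ((uint64_t)i->ctime_sec << 32) | i->version;
}

int main(void)
{
	struct ea_inode_demo i;

	set_ref(&i, 0x123456789abcdef0ULL);
	/* prints 123456789abcdef0: the split fields round-trip */
	printf("%llx\n", (unsigned long long)get_ref(&i));
	return 0;
}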
+/*
+ * Read the EA value from an inode.
+ */
+static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
+{
+ unsigned long block = 0;
+ struct buffer_head *bh;
+ int blocksize = ea_inode->i_sb->s_blocksize;
+ size_t csize, copied = 0;
+ void *copy_pos = buf;
+
+ while (copied < size) {
+ csize = (size - copied) > blocksize ? blocksize : size - copied;
+ bh = ext4_bread(NULL, ea_inode, block, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+ if (!bh)
+ return -EFSCORRUPTED;
+
+ memcpy(copy_pos, bh->b_data, csize);
+ brelse(bh);
+
+ copy_pos += csize;
+ block += 1;
+ copied += csize;
+ }
+ return 0;
+}
+
+static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
+ struct inode **ea_inode)
+{
+ struct inode *inode;
+ int err;
+
+ inode = ext4_iget(parent->i_sb, ea_ino);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ ext4_error(parent->i_sb,
+ "error while reading EA inode %lu err=%d", ea_ino,
+ err);
+ return err;
+ }
+
+ if (is_bad_inode(inode)) {
+ ext4_error(parent->i_sb,
+ "error while reading EA inode %lu is_bad_inode",
+ ea_ino);
+ err = -EIO;
+ goto error;
+ }
+
+ if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ ext4_error(parent->i_sb,
+ "EA inode %lu does not have EXT4_EA_INODE_FL flag",
+ ea_ino);
+ err = -EINVAL;
+ goto error;
+ }
+
+ *ea_inode = inode;
+ return 0;
+error:
+ iput(inode);
+ return err;
+}
+
+static int
+ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
+ struct ext4_xattr_entry *entry, void *buffer,
+ size_t size)
+{
+ u32 hash;
+
+ /* Verify stored hash matches calculated hash. */
+ hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size);
+ if (hash != ext4_xattr_inode_get_hash(ea_inode))
+ return -EFSCORRUPTED;
+
+ if (entry) {
+ __le32 e_hash, tmp_data;
+
+ /* Verify entry hash. */
+ tmp_data = cpu_to_le32(hash);
+ e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len,
+ &tmp_data, 1);
+ if (e_hash != entry->e_hash)
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
+
+/*
+ * Read xattr value from the EA inode.
+ */
+static int
+ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry,
+ void *buffer, size_t size)
+{
+ struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
+ struct inode *ea_inode;
+ int err;
+
+ err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum),
+ &ea_inode);
+ if (err) {
+ ea_inode = NULL;
+ goto out;
+ }
+
+ if (i_size_read(ea_inode) != size) {
+ ext4_warning_inode(ea_inode,
+ "ea_inode file size=%llu entry size=%zu",
+ i_size_read(ea_inode), size);
+ err = -EFSCORRUPTED;
+ goto out;
+ }
+
+ err = ext4_xattr_inode_read(ea_inode, buffer, size);
+ if (err)
+ goto out;
+
+ err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size);
+ /*
+ * Compatibility check for the old Lustre ea_inode implementation. The
+ * old version does not have hash validation, but it does have a
+ * backpointer from the ea_inode to the parent inode.
+ */
+ if (err == -EFSCORRUPTED) {
+ if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino ||
+ ea_inode->i_generation != inode->i_generation) {
+ ext4_warning_inode(ea_inode,
+ "EA inode hash validation failed");
+ goto out;
+ }
+ /* Do not add ea_inode to the cache. */
+ ea_inode_cache = NULL;
+ } else if (err)
+ goto out;
+
+ if (ea_inode_cache)
+ mb_cache_entry_create(ea_inode_cache, GFP_NOFS,
+ ext4_xattr_inode_get_hash(ea_inode),
+ ea_inode->i_ino, true /* reusable */);
+out:
+ iput(ea_inode);
+ return err;
+}
+
static int
ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
void *buffer, size_t buffer_size)
@@ -277,7 +471,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
struct ext4_xattr_entry *entry;
size_t size;
int error;
- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
name_index, name, buffer, (long)buffer_size);
@@ -298,7 +492,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
error = -EFSCORRUPTED;
goto cleanup;
}
- ext4_xattr_cache_insert(ext4_mb_cache, bh);
+ ext4_xattr_block_cache_insert(ea_block_cache, bh);
entry = BFIRST(bh);
error = ext4_xattr_find_entry(&entry, name_index, name, 1);
if (error)
@@ -308,8 +502,15 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
error = -ERANGE;
if (size > buffer_size)
goto cleanup;
- memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
- size);
+ if (entry->e_value_inum) {
+ error = ext4_xattr_inode_get(inode, entry, buffer,
+ size);
+ if (error)
+ goto cleanup;
+ } else {
+ memcpy(buffer, bh->b_data +
+ le16_to_cpu(entry->e_value_offs), size);
+ }
}
error = size;
@@ -350,8 +551,15 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
error = -ERANGE;
if (size > buffer_size)
goto cleanup;
- memcpy(buffer, (void *)IFIRST(header) +
- le16_to_cpu(entry->e_value_offs), size);
+ if (entry->e_value_inum) {
+ error = ext4_xattr_inode_get(inode, entry, buffer,
+ size);
+ if (error)
+ goto cleanup;
+ } else {
+ memcpy(buffer, (void *)IFIRST(header) +
+ le16_to_cpu(entry->e_value_offs), size);
+ }
}
error = size;
@@ -428,7 +636,6 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct inode *inode = d_inode(dentry);
struct buffer_head *bh = NULL;
int error;
- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
@@ -450,7 +657,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
error = -EFSCORRUPTED;
goto cleanup;
}
- ext4_xattr_cache_insert(ext4_mb_cache, bh);
+ ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh);
error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
cleanup:
@@ -539,15 +746,445 @@ static void ext4_xattr_update_super_block(handle_t *handle,
}
}
+int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
+{
+ struct ext4_iloc iloc = { .bh = NULL };
+ struct buffer_head *bh = NULL;
+ struct ext4_inode *raw_inode;
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_xattr_entry *entry;
+ qsize_t ea_inode_refs = 0;
+ void *end;
+ int ret;
+
+ lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ goto out;
+ raw_inode = ext4_raw_inode(&iloc);
+ header = IHDR(inode, raw_inode);
+ end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ ret = xattr_check_inode(inode, header, end);
+ if (ret)
+ goto out;
+
+ for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
+ entry = EXT4_XATTR_NEXT(entry))
+ if (entry->e_value_inum)
+ ea_inode_refs++;
+ }
+
+ if (EXT4_I(inode)->i_file_acl) {
+ bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+ if (!bh) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (ext4_xattr_check_block(inode, bh)) {
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
+
+ for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
+ entry = EXT4_XATTR_NEXT(entry))
+ if (entry->e_value_inum)
+ ea_inode_refs++;
+ }
+ *usage = ea_inode_refs + 1;
+ ret = 0;
+out:
+ brelse(iloc.bh);
+ brelse(bh);
+ return ret;
+}
+
+static inline size_t round_up_cluster(struct inode *inode, size_t length)
+{
+ struct super_block *sb = inode->i_sb;
+ size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
+ inode->i_blkbits);
+ size_t mask = ~(cluster_size - 1);
+
+ return (length + cluster_size - 1) & mask;
+}
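/*
 * Aside: round_up_cluster() above charges quota in whole clusters via the
 * usual power-of-two round-up.  The same arithmetic as a standalone
 * user-space check (illustration only):
 */
#include <stdio.h>
#include <stddef.h>

static size_t round_up_cluster_demo(size_t length, size_t cluster_size)
{
	size_t mask = ~(cluster_size - 1);	/* cluster_size is a power of two */

	return (length + cluster_size - 1) & mask;
}

int main(void)
{
	printf("%zu\n", round_up_cluster_demo(5000, 4096));	/* 8192 */
	printf("%zu\n", round_up_cluster_demo(4096, 4096));	/* 4096 */
	return 0;
}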
+
+static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
+{
+ int err;
+
+ err = dquot_alloc_inode(inode);
+ if (err)
+ return err;
+ err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
+ if (err)
+ dquot_free_inode(inode);
+ return err;
+}
+
+static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
+{
+ dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
+ dquot_free_inode(inode);
+}
+
+int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
+ struct buffer_head *block_bh, size_t value_len,
+ bool is_create)
+{
+ int credits;
+ int blocks;
+
+ /*
+ * 1) Owner inode update
+ * 2) Ref count update on old xattr block
+ * 3) new xattr block
+ * 4) block bitmap update for new xattr block
+ * 5) group descriptor for new xattr block
+ * 6) block bitmap update for old xattr block
+ * 7) group descriptor for old block
+ *
+ * 6 & 7 can happen if we have two racing threads T_a and T_b
+ * which are each trying to set an xattr on inodes I_a and I_b
+ * which were both initially sharing an xattr block.
+ */
+ credits = 7;
+
+ /* Quota updates. */
+ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
+
+ /*
+ * In case of inline data, we may push out the data to a block,
+ * so we need to reserve credits for this eventuality
+ */
+ if (inode && ext4_has_inline_data(inode))
+ credits += ext4_writepage_trans_blocks(inode) + 1;
+
+ /* We are done if ea_inode feature is not enabled. */
+ if (!ext4_has_feature_ea_inode(sb))
+ return credits;
+
+ /* New ea_inode, inode map, block bitmap, group descriptor. */
+ credits += 4;
+
+ /* Data blocks. */
+ blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+
+ /* Indirection block or one level of extent tree. */
+ blocks += 1;
+
+ /* Block bitmap and group descriptor updates for each block. */
+ credits += blocks * 2;
+
+ /* Blocks themselves. */
+ credits += blocks;
+
+ if (!is_create) {
+ /* Dereference ea_inode holding old xattr value.
+ * Old ea_inode, inode map, block bitmap, group descriptor.
+ */
+ credits += 4;
+
+ /* Data blocks for old ea_inode. */
+ blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
+
+ /* Indirection block or one level of extent tree for old
+ * ea_inode.
+ */
+ blocks += 1;
+
+ /* Block bitmap and group descriptor updates for each block. */
+ credits += blocks * 2;
+ }
+
+ /* We may need to clone the existing xattr block, in which case we need
+ * to increment ref counts for the existing ea_inodes referenced by it.
+ */
+ if (block_bh) {
+ struct ext4_xattr_entry *entry = BFIRST(block_bh);
+
+ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
+ if (entry->e_value_inum)
+ /* Ref count update on ea_inode. */
+ credits += 1;
+ }
+ return credits;
+}
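/*
 * Aside: evaluating the accounting above for one sample case -- creating a
 * 16 KiB value with 4 KiB blocks, ea_inode enabled, no inline data, no
 * pre-existing xattr block, quota disabled (EXT4_MAXQUOTAS_TRANS_BLOCKS()
 * is superblock-dependent, so it is a placeholder here):
 */
#include <stdio.h>

#define MAXQUOTAS_TRANS 0	/* placeholder: quota disabled */

int main(void)
{
	int credits = 7;			/* items 1-7 in the comment above */
	int blocks;

	credits += MAXQUOTAS_TRANS;		/* quota updates: +0 */
	credits += 4;				/* new ea_inode, map, bitmap, gd: 11 */
	blocks = (16384 + 4096 - 1) / 4096;	/* data blocks: 4 */
	blocks += 1;				/* indirection/extent level: 5 */
	credits += blocks * 2;			/* bitmap + gd per block: 21 */
	credits += blocks;			/* the blocks themselves: 26 */
	printf("credits = %d\n", credits);	/* -> 26 */
	return 0;
}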
+
+static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
+ int credits, struct buffer_head *bh,
+ bool dirty, bool block_csum)
+{
+ int error;
+
+ if (!ext4_handle_valid(handle))
+ return 0;
+
+ if (handle->h_buffer_credits >= credits)
+ return 0;
+
+ error = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
+ if (!error)
+ return 0;
+ if (error < 0) {
+ ext4_warning(inode->i_sb, "Extend journal (error %d)", error);
+ return error;
+ }
+
+ if (bh && dirty) {
+ if (block_csum)
+ ext4_xattr_block_csum_set(inode, bh);
+ error = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (error) {
+ ext4_warning(inode->i_sb, "Handle metadata (error %d)",
+ error);
+ return error;
+ }
+ }
+
+ error = ext4_journal_restart(handle, credits);
+ if (error) {
+ ext4_warning(inode->i_sb, "Restart journal (error %d)", error);
+ return error;
+ }
+
+ if (bh) {
+ error = ext4_journal_get_write_access(handle, bh);
+ if (error) {
+ ext4_warning(inode->i_sb,
+ "Get write access failed (error %d)",
+ error);
+ return error;
+ }
+ }
+ return 0;
+}
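/*
 * Aside: ext4_xattr_ensure_credits() is the standard JBD2 extend-or-restart
 * pattern; the bh handling above exists because a restart commits the
 * running transaction, so a dirtied buffer must be written out first and
 * write access re-acquired afterwards.  Stripped to its core (a sketch;
 * ext4_journal_extend() returns 0 = extended, >0 = cannot extend, <0 = error):
 */
static int demo_ensure_credits(handle_t *handle, int credits)
{
	int err;

	if (handle->h_buffer_credits >= credits)
		return 0;			/* already enough room */

	err = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
	if (err <= 0)
		return err;			/* extended in place, or hard error */

	/* could not extend: commit what we have, start a fresh handle */
	return ext4_journal_restart(handle, credits);
}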
+
+static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
+ int ref_change)
+{
+ struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
+ struct ext4_iloc iloc;
+ s64 ref_count;
+ u32 hash;
+ int ret;
+
+ inode_lock(ea_inode);
+
+ ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
+ if (ret) {
+ iloc.bh = NULL;
+ goto out;
+ }
+
+ ref_count = ext4_xattr_inode_get_ref(ea_inode);
+ ref_count += ref_change;
+ ext4_xattr_inode_set_ref(ea_inode, ref_count);
+
+ if (ref_change > 0) {
+ WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld",
+ ea_inode->i_ino, ref_count);
+
+ if (ref_count == 1) {
+ WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
+ ea_inode->i_ino, ea_inode->i_nlink);
+
+ set_nlink(ea_inode, 1);
+ ext4_orphan_del(handle, ea_inode);
+
+ if (ea_inode_cache) {
+ hash = ext4_xattr_inode_get_hash(ea_inode);
+ mb_cache_entry_create(ea_inode_cache,
+ GFP_NOFS, hash,
+ ea_inode->i_ino,
+ true /* reusable */);
+ }
+ }
+ } else {
+ WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
+ ea_inode->i_ino, ref_count);
+
+ if (ref_count == 0) {
+ WARN_ONCE(ea_inode->i_nlink != 1,
+ "EA inode %lu i_nlink=%u",
+ ea_inode->i_ino, ea_inode->i_nlink);
+
+ clear_nlink(ea_inode);
+ ext4_orphan_add(handle, ea_inode);
+
+ if (ea_inode_cache) {
+ hash = ext4_xattr_inode_get_hash(ea_inode);
+ mb_cache_entry_delete(ea_inode_cache, hash,
+ ea_inode->i_ino);
+ }
+ }
+ }
+
+ ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
+ iloc.bh = NULL;
+ if (ret)
+ ext4_warning_inode(ea_inode,
+ "ext4_mark_iloc_dirty() failed ret=%d", ret);
+out:
+ brelse(iloc.bh);
+ inode_unlock(ea_inode);
+ return ret;
+}
+
+static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
+{
+ return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
+}
+
+static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
+{
+ return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
+}
+
+static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
+ struct ext4_xattr_entry *first)
+{
+ struct inode *ea_inode;
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_entry *failed_entry;
+ unsigned int ea_ino;
+ int err, saved_err;
+
+ for (entry = first; !IS_LAST_ENTRY(entry);
+ entry = EXT4_XATTR_NEXT(entry)) {
+ if (!entry->e_value_inum)
+ continue;
+ ea_ino = le32_to_cpu(entry->e_value_inum);
+ err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+ if (err)
+ goto cleanup;
+ err = ext4_xattr_inode_inc_ref(handle, ea_inode);
+ if (err) {
+ ext4_warning_inode(ea_inode, "inc ref error %d", err);
+ iput(ea_inode);
+ goto cleanup;
+ }
+ iput(ea_inode);
+ }
+ return 0;
+
+cleanup:
+ saved_err = err;
+ failed_entry = entry;
+
+ for (entry = first; entry != failed_entry;
+ entry = EXT4_XATTR_NEXT(entry)) {
+ if (!entry->e_value_inum)
+ continue;
+ ea_ino = le32_to_cpu(entry->e_value_inum);
+ err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+ if (err) {
+ ext4_warning(parent->i_sb,
+ "cleanup ea_ino %u iget error %d", ea_ino,
+ err);
+ continue;
+ }
+ err = ext4_xattr_inode_dec_ref(handle, ea_inode);
+ if (err)
+ ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
+ err);
+ iput(ea_inode);
+ }
+ return saved_err;
+}
+
+static void
+ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
+ struct buffer_head *bh,
+ struct ext4_xattr_entry *first, bool block_csum,
+ struct ext4_xattr_inode_array **ea_inode_array,
+ int extra_credits, bool skip_quota)
+{
+ struct inode *ea_inode;
+ struct ext4_xattr_entry *entry;
+ bool dirty = false;
+ unsigned int ea_ino;
+ int err;
+ int credits;
+
+ /* One credit for dec ref on ea_inode, one for orphan list addition. */
+ credits = 2 + extra_credits;
+
+ for (entry = first; !IS_LAST_ENTRY(entry);
+ entry = EXT4_XATTR_NEXT(entry)) {
+ if (!entry->e_value_inum)
+ continue;
+ ea_ino = le32_to_cpu(entry->e_value_inum);
+ err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+ if (err)
+ continue;
+
+ err = ext4_expand_inode_array(ea_inode_array, ea_inode);
+ if (err) {
+ ext4_warning_inode(ea_inode,
+ "Expand inode array err=%d", err);
+ iput(ea_inode);
+ continue;
+ }
+
+ err = ext4_xattr_ensure_credits(handle, parent, credits, bh,
+ dirty, block_csum);
+ if (err) {
+ ext4_warning_inode(ea_inode, "Ensure credits err=%d",
+ err);
+ continue;
+ }
+
+ err = ext4_xattr_inode_dec_ref(handle, ea_inode);
+ if (err) {
+ ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
+ err);
+ continue;
+ }
+
+ if (!skip_quota)
+ ext4_xattr_inode_free_quota(parent,
+ le32_to_cpu(entry->e_value_size));
+
+ /*
+ * Forget about ea_inode within the same transaction that
+ * decrements the ref count. This avoids duplicate decrements in
+ * case the rest of the work spills over to subsequent
+ * transactions.
+ */
+ entry->e_value_inum = 0;
+ entry->e_value_size = 0;
+
+ dirty = true;
+ }
+
+ if (dirty) {
+ /*
+ * Note that we are deliberately skipping csum calculation for
+ * the final update because we do not expect any journal
+ * restarts until xattr block is freed.
+ */
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ ext4_warning_inode(parent,
+ "handle dirty metadata err=%d", err);
+ }
+}
+
/*
* Release the xattr block BH: If the reference count is > 1, decrement it;
* otherwise free the block.
*/
static void
ext4_xattr_release_block(handle_t *handle, struct inode *inode,
- struct buffer_head *bh)
+ struct buffer_head *bh,
+ struct ext4_xattr_inode_array **ea_inode_array,
+ int extra_credits)
{
- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
u32 hash, ref;
int error = 0;
@@ -565,9 +1202,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
* This must happen under buffer lock for
* ext4_xattr_block_set() to reliably detect freed block
*/
- mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
+ if (ea_block_cache)
+ mb_cache_entry_delete(ea_block_cache, hash,
+ bh->b_blocknr);
get_bh(bh);
unlock_buffer(bh);
+
+ if (ext4_has_feature_ea_inode(inode->i_sb))
+ ext4_xattr_inode_dec_ref_all(handle, inode, bh,
+ BFIRST(bh),
+ true /* block_csum */,
+ ea_inode_array,
+ extra_credits,
+ true /* skip_quota */);
ext4_free_blocks(handle, inode, bh, 0, 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
@@ -577,11 +1224,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
struct mb_cache_entry *ce;
- ce = mb_cache_entry_get(ext4_mb_cache, hash,
- bh->b_blocknr);
- if (ce) {
- ce->e_reusable = 1;
- mb_cache_entry_put(ext4_mb_cache, ce);
+ if (ea_block_cache) {
+ ce = mb_cache_entry_get(ea_block_cache, hash,
+ bh->b_blocknr);
+ if (ce) {
+ ce->e_reusable = 1;
+ mb_cache_entry_put(ea_block_cache, ce);
+ }
}
}
@@ -620,7 +1269,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
size_t *min_offs, void *base, int *total)
{
for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
- if (last->e_value_size) {
+ if (!last->e_value_inum && last->e_value_size) {
size_t offs = le16_to_cpu(last->e_value_offs);
if (offs < *min_offs)
*min_offs = offs;
@@ -631,113 +1280,454 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
return (*min_offs - ((void *)last - base) - sizeof(__u32));
}
-static int
-ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
+/*
+ * Write the value of the EA in an inode.
+ */
+static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
+ const void *buf, int bufsize)
+{
+ struct buffer_head *bh = NULL;
+ unsigned long block = 0;
+ int blocksize = ea_inode->i_sb->s_blocksize;
+ int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
+ int csize, wsize = 0;
+ int ret = 0;
+ int retries = 0;
+
+retry:
+ while (ret >= 0 && ret < max_blocks) {
+ struct ext4_map_blocks map;
+ map.m_lblk = block += ret;
+ map.m_len = max_blocks -= ret;
+
+ ret = ext4_map_blocks(handle, ea_inode, &map,
+ EXT4_GET_BLOCKS_CREATE);
+ if (ret <= 0) {
+ ext4_mark_inode_dirty(handle, ea_inode);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(ea_inode->i_sb, &retries)) {
+ ret = 0;
+ goto retry;
+ }
+ break;
+ }
+ }
+
+ if (ret < 0)
+ return ret;
+
+ block = 0;
+ while (wsize < bufsize) {
+ if (bh != NULL)
+ brelse(bh);
+ csize = (bufsize - wsize) > blocksize ? blocksize :
+ bufsize - wsize;
+ bh = ext4_getblk(handle, ea_inode, block, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+ ret = ext4_journal_get_write_access(handle, bh);
+ if (ret)
+ goto out;
+
+ memcpy(bh->b_data, buf, csize);
+ set_buffer_uptodate(bh);
+ ext4_handle_dirty_metadata(handle, ea_inode, bh);
+
+ buf += csize;
+ wsize += csize;
+ block += 1;
+ }
+
+ inode_lock(ea_inode);
+ i_size_write(ea_inode, wsize);
+ ext4_update_i_disksize(ea_inode, wsize);
+ inode_unlock(ea_inode);
+
+ ext4_mark_inode_dirty(handle, ea_inode);
+
+out:
+ brelse(bh);
+
+ return ret;
+}
+
+/*
+ * Create an inode to store the value of a large EA.
+ */
+static struct inode *ext4_xattr_inode_create(handle_t *handle,
+ struct inode *inode, u32 hash)
+{
+ struct inode *ea_inode = NULL;
+ uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
+ int err;
+
+ /*
+ * Let the next inode be the goal, so we try to allocate the EA inode
+ * in the same group, or a nearby one.
+ */
+ ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
+ S_IFREG | 0600, NULL, inode->i_ino + 1, owner,
+ EXT4_EA_INODE_FL);
+ if (!IS_ERR(ea_inode)) {
+ ea_inode->i_op = &ext4_file_inode_operations;
+ ea_inode->i_fop = &ext4_file_operations;
+ ext4_set_aops(ea_inode);
+ ext4_xattr_inode_set_class(ea_inode);
+ unlock_new_inode(ea_inode);
+ ext4_xattr_inode_set_ref(ea_inode, 1);
+ ext4_xattr_inode_set_hash(ea_inode, hash);
+ err = ext4_mark_inode_dirty(handle, ea_inode);
+ if (!err)
+ err = ext4_inode_attach_jinode(ea_inode);
+ if (err) {
+ iput(ea_inode);
+ return ERR_PTR(err);
+ }
+
+ /*
+ * Xattr inodes are shared; therefore quota charging is performed
+ * at a higher level.
+ */
+ dquot_free_inode(ea_inode);
+ dquot_drop(ea_inode);
+ inode_lock(ea_inode);
+ ea_inode->i_flags |= S_NOQUOTA;
+ inode_unlock(ea_inode);
+ }
+
+ return ea_inode;
+}
+
+static struct inode *
+ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
+ size_t value_len, u32 hash)
+{
+ struct inode *ea_inode;
+ struct mb_cache_entry *ce;
+ struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
+ void *ea_data;
+
+ if (!ea_inode_cache)
+ return NULL;
+
+ ce = mb_cache_entry_find_first(ea_inode_cache, hash);
+ if (!ce)
+ return NULL;
+
+ ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
+ if (!ea_data) {
+ mb_cache_entry_put(ea_inode_cache, ce);
+ return NULL;
+ }
+
+ while (ce) {
+ ea_inode = ext4_iget(inode->i_sb, ce->e_value);
+ if (!IS_ERR(ea_inode) &&
+ !is_bad_inode(ea_inode) &&
+ (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
+ i_size_read(ea_inode) == value_len &&
+ !ext4_xattr_inode_read(ea_inode, ea_data, value_len) &&
+ !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data,
+ value_len) &&
+ !memcmp(value, ea_data, value_len)) {
+ mb_cache_entry_touch(ea_inode_cache, ce);
+ mb_cache_entry_put(ea_inode_cache, ce);
+ kvfree(ea_data);
+ return ea_inode;
+ }
+
+ if (!IS_ERR(ea_inode))
+ iput(ea_inode);
+ ce = mb_cache_entry_find_next(ea_inode_cache, ce);
+ }
+ kvfree(ea_data);
+ return NULL;
+}
+
+/*
+ * Add value of the EA in an inode.
+ */
+static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
+ const void *value, size_t value_len,
+ struct inode **ret_inode)
+{
+ struct inode *ea_inode;
+ u32 hash;
+ int err;
+
+ hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
+ ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
+ if (ea_inode) {
+ err = ext4_xattr_inode_inc_ref(handle, ea_inode);
+ if (err) {
+ iput(ea_inode);
+ return err;
+ }
+
+ *ret_inode = ea_inode;
+ return 0;
+ }
+
+ /* Create an inode for the EA value */
+ ea_inode = ext4_xattr_inode_create(handle, inode, hash);
+ if (IS_ERR(ea_inode))
+ return PTR_ERR(ea_inode);
+
+ err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
+ if (err) {
+ ext4_xattr_inode_dec_ref(handle, ea_inode);
+ iput(ea_inode);
+ return err;
+ }
+
+ if (EA_INODE_CACHE(inode))
+ mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
+ ea_inode->i_ino, true /* reusable */);
+
+ *ret_inode = ea_inode;
+ return 0;
+}
+
+/*
+ * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode
+ * feature is enabled.
+ */
+#define EXT4_XATTR_BLOCK_RESERVE(inode) min(i_blocksize(inode)/8, 1024U)
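/*
 * Aside: for common block sizes the reserve is min(blocksize/8, 1024):
 * 128 bytes at 1 KiB, 512 at 4 KiB, capped at 1024 from 8 KiB up.
 * A quick user-space check of the macro's arithmetic (illustration only):
 */
#include <stdio.h>

#define BLOCK_RESERVE(bs) ((bs) / 8 < 1024u ? (bs) / 8 : 1024u)

int main(void)
{
	printf("%u\n", BLOCK_RESERVE(1024u));	/* 128 */
	printf("%u\n", BLOCK_RESERVE(4096u));	/* 512 */
	printf("%u\n", BLOCK_RESERVE(65536u));	/* 1024 */
	return 0;
}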
+
+static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
+ struct ext4_xattr_search *s,
+ handle_t *handle, struct inode *inode,
+ bool is_block)
{
struct ext4_xattr_entry *last;
- size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
+ struct ext4_xattr_entry *here = s->here;
+ size_t min_offs = s->end - s->base, name_len = strlen(i->name);
+ int in_inode = i->in_inode;
+ struct inode *old_ea_inode = NULL;
+ struct inode *new_ea_inode = NULL;
+ size_t old_size, new_size;
+ int ret;
+
+ /* Space used by old and new values. */
+ old_size = (!s->not_found && !here->e_value_inum) ?
+ EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0;
+ new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0;
+
+ /*
+ * Optimization for the simple case when old and new values have the
+ * same padded sizes. Not applicable if external inodes are involved.
+ */
+ if (new_size && new_size == old_size) {
+ size_t offs = le16_to_cpu(here->e_value_offs);
+ void *val = s->base + offs;
+
+ here->e_value_size = cpu_to_le32(i->value_len);
+ if (i->value == EXT4_ZERO_XATTR_VALUE) {
+ memset(val, 0, new_size);
+ } else {
+ memcpy(val, i->value, i->value_len);
+ /* Clear padding bytes. */
+ memset(val + i->value_len, 0, new_size - i->value_len);
+ }
+ return 0;
+ }
/* Compute min_offs and last. */
last = s->first;
for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
- if (last->e_value_size) {
+ if (!last->e_value_inum && last->e_value_size) {
size_t offs = le16_to_cpu(last->e_value_offs);
if (offs < min_offs)
min_offs = offs;
}
}
- free = min_offs - ((void *)last - s->base) - sizeof(__u32);
- if (!s->not_found) {
- if (s->here->e_value_size) {
- size_t size = le32_to_cpu(s->here->e_value_size);
- free += EXT4_XATTR_SIZE(size);
- }
- free += EXT4_XATTR_LEN(name_len);
- }
+
+ /* Check whether we have enough space. */
if (i->value) {
- if (free < EXT4_XATTR_LEN(name_len) +
- EXT4_XATTR_SIZE(i->value_len))
- return -ENOSPC;
+ size_t free;
+
+ free = min_offs - ((void *)last - s->base) - sizeof(__u32);
+ if (!s->not_found)
+ free += EXT4_XATTR_LEN(name_len) + old_size;
+
+ if (free < EXT4_XATTR_LEN(name_len) + new_size) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ /*
+ * If storing the value in an external inode is an option,
+ * reserve space for xattr entries/names in the external
+ * attribute block so that a long value does not occupy the
+ * whole space and prevent further entries from being added.
+ */
+ if (ext4_has_feature_ea_inode(inode->i_sb) &&
+ new_size && is_block &&
+ (min_offs + old_size - new_size) <
+ EXT4_XATTR_BLOCK_RESERVE(inode)) {
+ ret = -ENOSPC;
+ goto out;
+ }
}
- if (i->value && s->not_found) {
- /* Insert the new name. */
- size_t size = EXT4_XATTR_LEN(name_len);
- size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
- memmove((void *)s->here + size, s->here, rest);
- memset(s->here, 0, size);
- s->here->e_name_index = i->name_index;
- s->here->e_name_len = name_len;
- memcpy(s->here->e_name, i->name, name_len);
- } else {
- if (s->here->e_value_size) {
- void *first_val = s->base + min_offs;
- size_t offs = le16_to_cpu(s->here->e_value_offs);
- void *val = s->base + offs;
- size_t size = EXT4_XATTR_SIZE(
- le32_to_cpu(s->here->e_value_size));
-
- if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
- /* The old and the new value have the same
- size. Just replace. */
- s->here->e_value_size =
- cpu_to_le32(i->value_len);
- if (i->value == EXT4_ZERO_XATTR_VALUE) {
- memset(val, 0, size);
- } else {
- /* Clear pad bytes first. */
- memset(val + size - EXT4_XATTR_PAD, 0,
- EXT4_XATTR_PAD);
- memcpy(val, i->value, i->value_len);
- }
- return 0;
- }
+ /*
+ * Getting access to old and new ea inodes is subject to failures.
+ * Finish that work before doing any modifications to the xattr data.
+ */
+ if (!s->not_found && here->e_value_inum) {
+ ret = ext4_xattr_inode_iget(inode,
+ le32_to_cpu(here->e_value_inum),
+ &old_ea_inode);
+ if (ret) {
+ old_ea_inode = NULL;
+ goto out;
+ }
+ }
+ if (i->value && in_inode) {
+ WARN_ON_ONCE(!i->value_len);
+
+ ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
+ if (ret)
+ goto out;
+
+ ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
+ i->value_len,
+ &new_ea_inode);
+ if (ret) {
+ new_ea_inode = NULL;
+ ext4_xattr_inode_free_quota(inode, i->value_len);
+ goto out;
+ }
+ }
- /* Remove the old value. */
- memmove(first_val + size, first_val, val - first_val);
- memset(first_val, 0, size);
- s->here->e_value_size = 0;
- s->here->e_value_offs = 0;
- min_offs += size;
-
- /* Adjust all value offsets. */
- last = s->first;
- while (!IS_LAST_ENTRY(last)) {
- size_t o = le16_to_cpu(last->e_value_offs);
- if (last->e_value_size && o < offs)
- last->e_value_offs =
- cpu_to_le16(o + size);
- last = EXT4_XATTR_NEXT(last);
+ if (old_ea_inode) {
+ /* We are ready to release ref count on the old_ea_inode. */
+ ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
+ if (ret) {
+ /* Release newly required ref count on new_ea_inode. */
+ if (new_ea_inode) {
+ int err;
+
+ err = ext4_xattr_inode_dec_ref(handle,
+ new_ea_inode);
+ if (err)
+ ext4_warning_inode(new_ea_inode,
+ "dec ref new_ea_inode err=%d",
+ err);
+ ext4_xattr_inode_free_quota(inode,
+ i->value_len);
}
+ goto out;
}
- if (!i->value) {
- /* Remove the old name. */
- size_t size = EXT4_XATTR_LEN(name_len);
- last = ENTRY((void *)last - size);
- memmove(s->here, (void *)s->here + size,
- (void *)last - (void *)s->here + sizeof(__u32));
- memset(last, 0, size);
+
+ ext4_xattr_inode_free_quota(inode,
+ le32_to_cpu(here->e_value_size));
+ }
+
+ /* No failures allowed past this point. */
+
+ if (!s->not_found && here->e_value_offs) {
+ /* Remove the old value. */
+ void *first_val = s->base + min_offs;
+ size_t offs = le16_to_cpu(here->e_value_offs);
+ void *val = s->base + offs;
+
+ memmove(first_val + old_size, first_val, val - first_val);
+ memset(first_val, 0, old_size);
+ min_offs += old_size;
+
+ /* Adjust all value offsets. */
+ last = s->first;
+ while (!IS_LAST_ENTRY(last)) {
+ size_t o = le16_to_cpu(last->e_value_offs);
+
+ if (!last->e_value_inum &&
+ last->e_value_size && o < offs)
+ last->e_value_offs = cpu_to_le16(o + old_size);
+ last = EXT4_XATTR_NEXT(last);
}
}
+ if (!i->value) {
+ /* Remove old name. */
+ size_t size = EXT4_XATTR_LEN(name_len);
+
+ last = ENTRY((void *)last - size);
+ memmove(here, (void *)here + size,
+ (void *)last - (void *)here + sizeof(__u32));
+ memset(last, 0, size);
+ } else if (s->not_found) {
+ /* Insert new name. */
+ size_t size = EXT4_XATTR_LEN(name_len);
+ size_t rest = (void *)last - (void *)here + sizeof(__u32);
+
+ memmove((void *)here + size, here, rest);
+ memset(here, 0, size);
+ here->e_name_index = i->name_index;
+ here->e_name_len = name_len;
+ memcpy(here->e_name, i->name, name_len);
+ } else {
+ /* This is an update, reset value info. */
+ here->e_value_inum = 0;
+ here->e_value_offs = 0;
+ here->e_value_size = 0;
+ }
+
if (i->value) {
- /* Insert the new value. */
- s->here->e_value_size = cpu_to_le32(i->value_len);
- if (i->value_len) {
- size_t size = EXT4_XATTR_SIZE(i->value_len);
- void *val = s->base + min_offs - size;
- s->here->e_value_offs = cpu_to_le16(min_offs - size);
+ /* Insert new value. */
+ if (in_inode) {
+ here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
+ } else if (i->value_len) {
+ void *val = s->base + min_offs - new_size;
+
+ here->e_value_offs = cpu_to_le16(min_offs - new_size);
if (i->value == EXT4_ZERO_XATTR_VALUE) {
- memset(val, 0, size);
+ memset(val, 0, new_size);
} else {
- /* Clear the pad bytes first. */
- memset(val + size - EXT4_XATTR_PAD, 0,
- EXT4_XATTR_PAD);
memcpy(val, i->value, i->value_len);
+ /* Clear padding bytes. */
+ memset(val + i->value_len, 0,
+ new_size - i->value_len);
}
}
+ here->e_value_size = cpu_to_le32(i->value_len);
}
- return 0;
+
+ if (i->value) {
+ __le32 hash = 0;
+
+ /* Entry hash calculation. */
+ if (in_inode) {
+ __le32 crc32c_hash;
+
+ /*
+ * Feed crc32c hash instead of the raw value for entry
+ * hash calculation. This avoids walking the
+ * potentially long value buffer again.
+ */
+ crc32c_hash = cpu_to_le32(
+ ext4_xattr_inode_get_hash(new_ea_inode));
+ hash = ext4_xattr_hash_entry(here->e_name,
+ here->e_name_len,
+ &crc32c_hash, 1);
+ } else if (is_block) {
+ __le32 *value = s->base + min_offs - new_size;
+
+ hash = ext4_xattr_hash_entry(here->e_name,
+ here->e_name_len, value,
+ new_size >> 2);
+ }
+ here->e_hash = hash;
+ }
+
+ if (is_block)
+ ext4_xattr_rehash((struct ext4_xattr_header *)s->base);
+
+ ret = 0;
+out:
+ iput(old_ea_inode);
+ iput(new_ea_inode);
+ return ret;
}
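/*
 * Aside: note the trick in the hash block at the end of
 * ext4_xattr_set_entry(): for in-inode values the entry hash is fed a
 * single word, the crc32c of the value, instead of the value itself, so
 * the cost stays O(1) in the value length.  The legacy entry hash is not
 * part of this hunk; from memory it is a rotate-xor over name bytes and
 * value words, roughly as below (treat as a sketch, not a quote):
 */
#define NAME_HASH_SHIFT  5
#define VALUE_HASH_SHIFT 16

static __le32 demo_hash_entry(char *name, size_t name_len,
			      __le32 *value, size_t value_count)
{
	__u32 hash = 0;

	while (name_len--) {
		hash = (hash << NAME_HASH_SHIFT) ^
		       (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^
		       *name++;
	}
	while (value_count--) {
		hash = (hash << VALUE_HASH_SHIFT) ^
		       (hash >> (8 * sizeof(hash) - VALUE_HASH_SHIFT)) ^
		       le32_to_cpu(*value++);
	}
	return cpu_to_le32(hash);
}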
struct ext4_xattr_block_find {
@@ -794,15 +1784,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
{
struct super_block *sb = inode->i_sb;
struct buffer_head *new_bh = NULL;
- struct ext4_xattr_search *s = &bs->s;
+ struct ext4_xattr_search s_copy = bs->s;
+ struct ext4_xattr_search *s = &s_copy;
struct mb_cache_entry *ce = NULL;
int error = 0;
- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
+ struct inode *ea_inode = NULL;
+ size_t old_ea_inode_size = 0;
#define header(x) ((struct ext4_xattr_header *)(x))
- if (i->value && i->value_len > sb->s_blocksize)
- return -ENOSPC;
if (s->base) {
BUFFER_TRACE(bs->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bs->bh);
@@ -818,17 +1809,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
* ext4_xattr_block_set() to reliably detect modified
* block
*/
- mb_cache_entry_delete_block(ext4_mb_cache, hash,
- bs->bh->b_blocknr);
+ if (ea_block_cache)
+ mb_cache_entry_delete(ea_block_cache, hash,
+ bs->bh->b_blocknr);
ea_bdebug(bs->bh, "modifying in-place");
- error = ext4_xattr_set_entry(i, s);
- if (!error) {
- if (!IS_LAST_ENTRY(s->first))
- ext4_xattr_rehash(header(s->base),
- s->here);
- ext4_xattr_cache_insert(ext4_mb_cache,
- bs->bh);
- }
+ error = ext4_xattr_set_entry(i, s, handle, inode,
+ true /* is_block */);
+ if (!error)
+ ext4_xattr_block_cache_insert(ea_block_cache,
+ bs->bh);
ext4_xattr_block_csum_set(inode, bs->bh);
unlock_buffer(bs->bh);
if (error == -EFSCORRUPTED)
@@ -854,6 +1843,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
header(s->base)->h_refcount = cpu_to_le32(1);
s->here = ENTRY(s->base + offset);
s->end = s->base + bs->bh->b_size;
+
+ /*
+ * If existing entry points to an xattr inode, we need
+ * to prevent ext4_xattr_set_entry() from decrementing
+ * ref count on it because the reference belongs to the
+ * original block. In this case, make the entry look
+ * like it has an empty value.
+ */
+ if (!s->not_found && s->here->e_value_inum) {
+ /*
+ * Defer quota free call for previous inode
+ * until success is guaranteed.
+ */
+ old_ea_inode_size = le32_to_cpu(
+ s->here->e_value_size);
+ s->here->e_value_inum = 0;
+ s->here->e_value_size = 0;
+ }
}
} else {
/* Allocate a buffer where we construct the new block. */
@@ -870,17 +1877,33 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
s->end = s->base + sb->s_blocksize;
}
- error = ext4_xattr_set_entry(i, s);
+ error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */);
if (error == -EFSCORRUPTED)
goto bad_block;
if (error)
goto cleanup;
- if (!IS_LAST_ENTRY(s->first))
- ext4_xattr_rehash(header(s->base), s->here);
+
+ if (i->value && s->here->e_value_inum) {
+ unsigned int ea_ino;
+
+ /*
+ * A ref count on ea_inode has been taken as part of the call to
+ * ext4_xattr_set_entry() above. We would like to drop this
+ * extra ref but we have to wait until the xattr block is
+ * initialized and has its own ref count on the ea_inode.
+ */
+ ea_ino = le32_to_cpu(s->here->e_value_inum);
+ error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+ if (error) {
+ ea_inode = NULL;
+ goto cleanup;
+ }
+ }
inserted:
if (!IS_LAST_ENTRY(s->first)) {
- new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce);
+ new_bh = ext4_xattr_block_cache_find(inode, header(s->base),
+ &ce);
if (new_bh) {
/* We found an identical block in the cache. */
if (new_bh == bs->bh)
@@ -888,6 +1911,8 @@ inserted:
else {
u32 ref;
+ WARN_ON_ONCE(dquot_initialize_needed(inode));
+
/* The old block is released after updating
the inode. */
error = dquot_alloc_block(inode,
@@ -923,7 +1948,7 @@ inserted:
EXT4_C2B(EXT4_SB(sb),
1));
brelse(new_bh);
- mb_cache_entry_put(ext4_mb_cache, ce);
+ mb_cache_entry_put(ea_block_cache, ce);
ce = NULL;
new_bh = NULL;
goto inserted;
@@ -942,8 +1967,8 @@ inserted:
if (error)
goto cleanup_dquot;
}
- mb_cache_entry_touch(ext4_mb_cache, ce);
- mb_cache_entry_put(ext4_mb_cache, ce);
+ mb_cache_entry_touch(ea_block_cache, ce);
+ mb_cache_entry_put(ea_block_cache, ce);
ce = NULL;
} else if (bs->bh && s->base == bs->bh->b_data) {
/* We were modifying this block in-place. */
@@ -954,6 +1979,8 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal, block;
+ WARN_ON_ONCE(dquot_initialize_needed(inode));
+
goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
@@ -980,6 +2007,22 @@ getblk_failed:
EXT4_FREE_BLOCKS_METADATA);
goto cleanup;
}
+ error = ext4_xattr_inode_inc_ref_all(handle, inode,
+ ENTRY(header(s->base)+1));
+ if (error)
+ goto getblk_failed;
+ if (ea_inode) {
+ /* Drop the extra ref on ea_inode. */
+ error = ext4_xattr_inode_dec_ref(handle,
+ ea_inode);
+ if (error)
+ ext4_warning_inode(ea_inode,
+ "dec ref error=%d",
+ error);
+ iput(ea_inode);
+ ea_inode = NULL;
+ }
+
lock_buffer(new_bh);
error = ext4_journal_get_create_access(handle, new_bh);
if (error) {
@@ -991,7 +2034,7 @@ getblk_failed:
ext4_xattr_block_csum_set(inode, new_bh);
set_buffer_uptodate(new_bh);
unlock_buffer(new_bh);
- ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
+ ext4_xattr_block_cache_insert(ea_block_cache, new_bh);
error = ext4_handle_dirty_metadata(handle, inode,
new_bh);
if (error)
@@ -999,17 +2042,40 @@ getblk_failed:
}
}
+ if (old_ea_inode_size)
+ ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
+
/* Update the inode. */
EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
/* Drop the previous xattr block. */
- if (bs->bh && bs->bh != new_bh)
- ext4_xattr_release_block(handle, inode, bs->bh);
+ if (bs->bh && bs->bh != new_bh) {
+ struct ext4_xattr_inode_array *ea_inode_array = NULL;
+
+ ext4_xattr_release_block(handle, inode, bs->bh,
+ &ea_inode_array,
+ 0 /* extra_credits */);
+ ext4_xattr_inode_array_free(ea_inode_array);
+ }
error = 0;
cleanup:
+ if (ea_inode) {
+ int error2;
+
+ error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
+ if (error2)
+ ext4_warning_inode(ea_inode, "dec ref error=%d",
+ error2);
+
+ /* If there was an error, revert the quota charge. */
+ if (error)
+ ext4_xattr_inode_free_quota(inode,
+ i_size_read(ea_inode));
+ iput(ea_inode);
+ }
if (ce)
- mb_cache_entry_put(ext4_mb_cache, ce);
+ mb_cache_entry_put(ea_block_cache, ce);
brelse(new_bh);
if (!(bs->bh && s->base == bs->bh->b_data))
kfree(s->base);
@@ -1066,7 +2132,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
if (EXT4_I(inode)->i_extra_isize == 0)
return -ENOSPC;
- error = ext4_xattr_set_entry(i, s);
+ error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
if (error) {
if (error == -ENOSPC &&
ext4_has_inline_data(inode)) {
@@ -1078,7 +2144,8 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
error = ext4_xattr_ibody_find(inode, i, is);
if (error)
return error;
- error = ext4_xattr_set_entry(i, s);
+ error = ext4_xattr_set_entry(i, s, handle, inode,
+ false /* is_block */);
}
if (error)
return error;
@@ -1094,7 +2161,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
return 0;
}
-static int ext4_xattr_ibody_set(struct inode *inode,
+static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is)
{
@@ -1104,7 +2171,7 @@ static int ext4_xattr_ibody_set(struct inode *inode,
if (EXT4_I(inode)->i_extra_isize == 0)
return -ENOSPC;
- error = ext4_xattr_set_entry(i, s);
+ error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
if (error)
return error;
header = IHDR(inode, ext4_raw_inode(&is->iloc));
@@ -1123,12 +2190,31 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
{
void *value;
+ /* When e_value_inum is set, the value is stored externally. */
+ if (s->here->e_value_inum)
+ return 0;
if (le32_to_cpu(s->here->e_value_size) != i->value_len)
return 0;
value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs);
return !memcmp(value, i->value, i->value_len);
}
+static struct buffer_head *ext4_xattr_get_block(struct inode *inode)
+{
+ struct buffer_head *bh;
+ int error;
+
+ if (!EXT4_I(inode)->i_file_acl)
+ return NULL;
+ bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+ if (!bh)
+ return ERR_PTR(-EIO);
+ error = ext4_xattr_check_block(inode, bh);
+ if (error)
+ return ERR_PTR(error);
+ return bh;
+}
+
/*
* ext4_xattr_set_handle()
*
@@ -1151,7 +2237,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
.name = name,
.value = value,
.value_len = value_len,
-
+ .in_inode = 0,
};
struct ext4_xattr_ibody_find is = {
.s = { .not_found = -ENODATA, },
@@ -1166,8 +2252,31 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
return -EINVAL;
if (strlen(name) > 255)
return -ERANGE;
+
ext4_write_lock_xattr(inode, &no_expand);
+ /* Check journal credits under write lock. */
+ if (ext4_handle_valid(handle)) {
+ struct buffer_head *bh;
+ int credits;
+
+ bh = ext4_xattr_get_block(inode);
+ if (IS_ERR(bh)) {
+ error = PTR_ERR(bh);
+ goto cleanup;
+ }
+
+ credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh,
+ value_len,
+ flags & XATTR_CREATE);
+ brelse(bh);
+
+ if (!ext4_handle_has_enough_credits(handle, credits)) {
+ error = -ENOSPC;
+ goto cleanup;
+ }
+ }
+
error = ext4_reserve_inode_write(handle, inode, &is.iloc);
if (error)
goto cleanup;
@@ -1197,9 +2306,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (flags & XATTR_CREATE)
goto cleanup;
}
+
if (!value) {
if (!is.s.not_found)
- error = ext4_xattr_ibody_set(inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
else if (!bs.s.not_found)
error = ext4_xattr_block_set(handle, inode, &i, &bs);
} else {
@@ -1210,7 +2320,12 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
goto cleanup;
- error = ext4_xattr_ibody_set(inode, &i, &is);
+ if (ext4_has_feature_ea_inode(inode->i_sb) &&
+ (EXT4_XATTR_SIZE(i.value_len) >
+ EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
+ i.in_inode = 1;
+retry_inode:
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (!error && !bs.s.not_found) {
i.value = NULL;
error = ext4_xattr_block_set(handle, inode, &i, &bs);
@@ -1221,11 +2336,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
goto cleanup;
}
error = ext4_xattr_block_set(handle, inode, &i, &bs);
- if (error)
- goto cleanup;
- if (!is.s.not_found) {
+ if (!error && !is.s.not_found) {
i.value = NULL;
- error = ext4_xattr_ibody_set(inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i,
+ &is);
+ } else if (error == -ENOSPC) {
+ /*
+ * The xattr does not fit in the block; store it in an
+ * external inode if possible.
+ */
+ if (ext4_has_feature_ea_inode(inode->i_sb) &&
+ !i.in_inode) {
+ i.in_inode = 1;
+ goto retry_inode;
+ }
}
}
}
@@ -1251,6 +2375,33 @@ cleanup:
return error;
}
+int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
+ bool is_create, int *credits)
+{
+ struct buffer_head *bh;
+ int err;
+
+ *credits = 0;
+
+ if (!EXT4_SB(inode->i_sb)->s_journal)
+ return 0;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+
+ bh = ext4_xattr_get_block(inode);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ } else {
+ *credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh,
+ value_len, is_create);
+ brelse(bh);
+ err = 0;
+ }
+
+ up_read(&EXT4_I(inode)->xattr_sem);
+ return err;
+}
+
/*
* ext4_xattr_set()
*
@@ -1264,10 +2415,20 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
const void *value, size_t value_len, int flags)
{
handle_t *handle;
+ struct super_block *sb = inode->i_sb;
int error, retries = 0;
- int credits = ext4_jbd2_credits_xattr(inode);
+ int credits;
+
+ error = dquot_initialize(inode);
+ if (error)
+ return error;
retry:
+ error = ext4_xattr_set_credits(inode, value_len, flags & XATTR_CREATE,
+ &credits);
+ if (error)
+ return error;
+
handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
@@ -1278,7 +2439,7 @@ retry:
value, value_len, flags);
error2 = ext4_journal_stop(handle);
if (error == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
+ ext4_should_retry_alloc(sb, &retries))
goto retry;
if (error == 0)
error = error2;
@@ -1303,7 +2464,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
/* Adjust the value offsets of the entries */
for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
- if (last->e_value_size) {
+ if (!last->e_value_inum && last->e_value_size) {
new_offs = le16_to_cpu(last->e_value_offs) +
value_offs_shift;
last->e_value_offs = cpu_to_le16(new_offs);
@@ -1323,18 +2484,16 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
struct ext4_xattr_ibody_find *is = NULL;
struct ext4_xattr_block_find *bs = NULL;
char *buffer = NULL, *b_entry_name = NULL;
- size_t value_offs, value_size;
+ size_t value_size = le32_to_cpu(entry->e_value_size);
struct ext4_xattr_info i = {
.value = NULL,
.value_len = 0,
.name_index = entry->e_name_index,
+ .in_inode = !!entry->e_value_inum,
};
struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode);
int error;
- value_offs = le16_to_cpu(entry->e_value_offs);
- value_size = le32_to_cpu(entry->e_value_size);
-
is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
buffer = kmalloc(value_size, GFP_NOFS);
@@ -1350,7 +2509,15 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
bs->bh = NULL;
/* Save the entry name and the entry value */
- memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size);
+ if (entry->e_value_inum) {
+ error = ext4_xattr_inode_get(inode, entry, buffer, value_size);
+ if (error)
+ goto out;
+ } else {
+ size_t value_offs = le16_to_cpu(entry->e_value_offs);
+ memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size);
+ }
+
memcpy(b_entry_name, entry->e_name, entry->e_name_len);
b_entry_name[entry->e_name_len] = '\0';
i.name = b_entry_name;
@@ -1364,11 +2531,10 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
goto out;
/* Remove the chosen entry from the inode */
- error = ext4_xattr_ibody_set(inode, &i, is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, is);
if (error)
goto out;
- i.name = b_entry_name;
i.value = buffer;
i.value_len = value_size;
error = ext4_xattr_block_find(inode, &i, bs);
@@ -1412,9 +2578,10 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode,
last = IFIRST(header);
/* Find the entry best suited to be pushed into EA block */
for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
- total_size =
- EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
- EXT4_XATTR_LEN(last->e_name_len);
+ total_size = EXT4_XATTR_LEN(last->e_name_len);
+ if (!last->e_value_inum)
+ total_size += EXT4_XATTR_SIZE(
+ le32_to_cpu(last->e_value_size));
if (total_size <= bfree &&
total_size < min_total_size) {
if (total_size + ifree < isize_diff) {
@@ -1433,8 +2600,10 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode,
}
entry_size = EXT4_XATTR_LEN(entry->e_name_len);
- total_size = entry_size +
- EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
+ total_size = entry_size;
+ if (!entry->e_value_inum)
+ total_size += EXT4_XATTR_SIZE(
+ le32_to_cpu(entry->e_value_size));
error = ext4_xattr_move_to_block(handle, inode, raw_inode,
entry);
if (error)
@@ -1563,51 +2732,172 @@ cleanup:
return error;
}
+#define EIA_INCR 16 /* must be 2^n */
+#define EIA_MASK (EIA_INCR - 1)
+/* Add the large xattr @inode into @ea_inode_array for deferred iput().
+ * If @ea_inode_array is new or full it will be grown and the old
+ * contents copied over.
+ */
+static int
+ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
+ struct inode *inode)
+{
+ if (*ea_inode_array == NULL) {
+ /*
+ * Start with 15 inodes, so the count field plus the inode array fits
+ * into a power-of-two size; the kmalloc size below is just offsetof().
+ */
+ (*ea_inode_array) =
+ kmalloc(offsetof(struct ext4_xattr_inode_array,
+ inodes[EIA_MASK]),
+ GFP_NOFS);
+ if (*ea_inode_array == NULL)
+ return -ENOMEM;
+ (*ea_inode_array)->count = 0;
+ } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) {
+ /* expand the array once all 15 + n * 16 slots are full */
+ struct ext4_xattr_inode_array *new_array = NULL;
+ int count = (*ea_inode_array)->count;
+
+ /* grow by EIA_INCR slots; the kmalloc size is again just offsetof() */
+ new_array = kmalloc(
+ offsetof(struct ext4_xattr_inode_array,
+ inodes[count + EIA_INCR]),
+ GFP_NOFS);
+ if (new_array == NULL)
+ return -ENOMEM;
+ memcpy(new_array, *ea_inode_array,
+ offsetof(struct ext4_xattr_inode_array, inodes[count]));
+ kfree(*ea_inode_array);
+ *ea_inode_array = new_array;
+ }
+ (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode;
+ return 0;
+}
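The sizing scheme above is worth seeing in isolation: the first allocation
holds EIA_MASK (15) slots so the count field plus payload lands on a
power-of-two size, and every EIA_INCR (16) insertions thereafter trigger a
grow-and-copy. A self-contained userspace sketch of the same pattern, with
realloc() standing in for the kmalloc/memcpy/kfree sequence and int for
struct inode *:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

#define EIA_INCR 16			/* must be 2^n */
#define EIA_MASK (EIA_INCR - 1)

struct int_array {
	unsigned int count;
	int items[];			/* flexible array member */
};

static int expand_array(struct int_array **arr, int value)
{
	if (*arr == NULL) {
		/* 4-byte header + 15 ints = 64 bytes, a power of two */
		*arr = malloc(offsetof(struct int_array, items[EIA_MASK]));
		if (*arr == NULL)
			return -1;
		(*arr)->count = 0;
	} else if (((*arr)->count & EIA_MASK) == EIA_MASK) {
		/* full at 15, 31, 47, ...: grow by another 16 slots */
		struct int_array *tmp;

		tmp = realloc(*arr, offsetof(struct int_array,
					     items[(*arr)->count + EIA_INCR]));
		if (tmp == NULL)
			return -1;
		*arr = tmp;
	}
	(*arr)->items[(*arr)->count++] = value;
	return 0;
}

int main(void)
{
	struct int_array *arr = NULL;
	int i;

	for (i = 0; i < 40; i++)
		if (expand_array(&arr, i))
			return 1;
	printf("stored %u items\n", arr->count);	/* stored 40 items */
	free(arr);
	return 0;
}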
/*
* ext4_xattr_delete_inode()
*
- * Free extended attribute resources associated with this inode. This
- * is called immediately before an inode is freed. We have exclusive
- * access to the inode.
+ * Free extended attribute resources associated with this inode. Traverse
+ * all entries and decrement reference on any xattr inodes associated with this
+ * inode. This is called immediately before an inode is freed. We have exclusive
+ * access to the inode. If an orphan inode is deleted it will also release its
+ * references on xattr block and xattr inodes.
*/
-void
-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
+int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_inode_array **ea_inode_array,
+ int extra_credits)
{
struct buffer_head *bh = NULL;
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_iloc iloc = { .bh = NULL };
+ struct ext4_xattr_entry *entry;
+ int error;
- if (!EXT4_I(inode)->i_file_acl)
- goto cleanup;
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- if (!bh) {
- EXT4_ERROR_INODE(inode, "block %llu read error",
- EXT4_I(inode)->i_file_acl);
+ error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
+ NULL /* bh */,
+ false /* dirty */,
+ false /* block_csum */);
+ if (error) {
+ EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
goto cleanup;
}
- if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
- BHDR(bh)->h_blocks != cpu_to_le32(1)) {
- EXT4_ERROR_INODE(inode, "bad block %llu",
- EXT4_I(inode)->i_file_acl);
- goto cleanup;
+
+ if (ext4_has_feature_ea_inode(inode->i_sb) &&
+ ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error) {
+ EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
+ goto cleanup;
+ }
+
+ error = ext4_journal_get_write_access(handle, iloc.bh);
+ if (error) {
+ EXT4_ERROR_INODE(inode, "write access (error %d)",
+ error);
+ goto cleanup;
+ }
+
+ header = IHDR(inode, ext4_raw_inode(&iloc));
+ if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+ ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
+ IFIRST(header),
+ false /* block_csum */,
+ ea_inode_array,
+ extra_credits,
+ false /* skip_quota */);
}
- ext4_xattr_release_block(handle, inode, bh);
- EXT4_I(inode)->i_file_acl = 0;
+ if (EXT4_I(inode)->i_file_acl) {
+ bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+ if (!bh) {
+ EXT4_ERROR_INODE(inode, "block %llu read error",
+ EXT4_I(inode)->i_file_acl);
+ error = -EIO;
+ goto cleanup;
+ }
+ error = ext4_xattr_check_block(inode, bh);
+ if (error) {
+ EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
+ EXT4_I(inode)->i_file_acl, error);
+ goto cleanup;
+ }
+
+ if (ext4_has_feature_ea_inode(inode->i_sb)) {
+ for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
+ entry = EXT4_XATTR_NEXT(entry))
+ if (entry->e_value_inum)
+ ext4_xattr_inode_free_quota(inode,
+ le32_to_cpu(entry->e_value_size));
+
+ }
+
+ ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
+ extra_credits);
+ /*
+ * Update i_file_acl value in the same transaction that releases
+ * block.
+ */
+ EXT4_I(inode)->i_file_acl = 0;
+ error = ext4_mark_inode_dirty(handle, inode);
+ if (error) {
+ EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
+ error);
+ goto cleanup;
+ }
+ }
+ error = 0;
cleanup:
+ brelse(iloc.bh);
brelse(bh);
+ return error;
+}
+
+void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
+{
+ int idx;
+
+ if (ea_inode_array == NULL)
+ return;
+
+ for (idx = 0; idx < ea_inode_array->count; ++idx)
+ iput(ea_inode_array->inodes[idx]);
+ kfree(ea_inode_array);
}
/*
- * ext4_xattr_cache_insert()
+ * ext4_xattr_block_cache_insert()
*
- * Create a new entry in the extended attribute cache, and insert
+ * Create a new entry in the extended attribute block cache, and insert
* it unless such an entry is already in the cache.
*
* Returns 0, or a negative error number on failure.
*/
static void
-ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
+ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
+ struct buffer_head *bh)
{
struct ext4_xattr_header *header = BHDR(bh);
__u32 hash = le32_to_cpu(header->h_hash);
@@ -1615,7 +2905,9 @@ ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
EXT4_XATTR_REFCOUNT_MAX;
int error;
- error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash,
+ if (!ea_block_cache)
+ return;
+ error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash,
bh->b_blocknr, reusable);
if (error) {
if (error == -EBUSY)
@@ -1647,11 +2939,11 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
entry1->e_name_index != entry2->e_name_index ||
entry1->e_name_len != entry2->e_name_len ||
entry1->e_value_size != entry2->e_value_size ||
+ entry1->e_value_inum != entry2->e_value_inum ||
memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
return 1;
- if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
- return -EFSCORRUPTED;
- if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
+ if (!entry1->e_value_inum &&
+ memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
(char *)header2 + le16_to_cpu(entry2->e_value_offs),
le32_to_cpu(entry1->e_value_size)))
return 1;
@@ -1665,7 +2957,7 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
}
/*
- * ext4_xattr_cache_find()
+ * ext4_xattr_block_cache_find()
*
* Find an identical extended attribute block.
*
@@ -1673,30 +2965,33 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
* not found or an error occurred.
*/
static struct buffer_head *
-ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
- struct mb_cache_entry **pce)
+ext4_xattr_block_cache_find(struct inode *inode,
+ struct ext4_xattr_header *header,
+ struct mb_cache_entry **pce)
{
__u32 hash = le32_to_cpu(header->h_hash);
struct mb_cache_entry *ce;
- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
+ if (!ea_block_cache)
+ return NULL;
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
- ce = mb_cache_entry_find_first(ext4_mb_cache, hash);
+ ce = mb_cache_entry_find_first(ea_block_cache, hash);
while (ce) {
struct buffer_head *bh;
- bh = sb_bread(inode->i_sb, ce->e_block);
+ bh = sb_bread(inode->i_sb, ce->e_value);
if (!bh) {
EXT4_ERROR_INODE(inode, "block %lu read error",
- (unsigned long) ce->e_block);
+ (unsigned long)ce->e_value);
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
*pce = ce;
return bh;
}
brelse(bh);
- ce = mb_cache_entry_find_next(ext4_mb_cache, ce);
+ ce = mb_cache_entry_find_next(ea_block_cache, ce);
}
return NULL;
}
@@ -1709,30 +3004,22 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
*
* Compute the hash of an extended attribute.
*/
-static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
- struct ext4_xattr_entry *entry)
+static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value,
+ size_t value_count)
{
__u32 hash = 0;
- char *name = entry->e_name;
- int n;
- for (n = 0; n < entry->e_name_len; n++) {
+ while (name_len--) {
hash = (hash << NAME_HASH_SHIFT) ^
(hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
*name++;
}
-
- if (entry->e_value_size != 0) {
- __le32 *value = (__le32 *)((char *)header +
- le16_to_cpu(entry->e_value_offs));
- for (n = (le32_to_cpu(entry->e_value_size) +
- EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) {
- hash = (hash << VALUE_HASH_SHIFT) ^
- (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
- le32_to_cpu(*value++);
- }
+ while (value_count--) {
+ hash = (hash << VALUE_HASH_SHIFT) ^
+ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
+ le32_to_cpu(*value++);
}
- entry->e_hash = cpu_to_le32(hash);
+ return cpu_to_le32(hash);
}
#undef NAME_HASH_SHIFT
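The refactor decouples the rotate-and-xor hash from the on-disk entry layout,
so callers can now hash a raw (name, value) pair whether the value lives in
the block or in an external inode. A userspace re-creation for experimenting,
assuming a little-endian host so the le32_to_cpu() conversions become no-ops:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NAME_HASH_SHIFT  5
#define VALUE_HASH_SHIFT 16

static uint32_t xattr_hash(const char *name, size_t name_len,
			   const uint32_t *value, size_t value_count)
{
	uint32_t hash = 0;

	while (name_len--) {
		hash = (hash << NAME_HASH_SHIFT) ^
		       (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^
		       (unsigned char)*name++;
	}
	/* value_count is the value length rounded up to 4-byte words */
	while (value_count--) {
		hash = (hash << VALUE_HASH_SHIFT) ^
		       (hash >> (8 * sizeof(hash) - VALUE_HASH_SHIFT)) ^
		       *value++;
	}
	return hash;
}

int main(void)
{
	uint32_t value[2] = { 0x11111111, 0x22222222 };
	const char *name = "test";

	printf("%08x\n", xattr_hash(name, strlen(name), value, 2));
	return 0;
}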
@@ -1745,13 +3032,11 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
*
* Re-compute the extended attribute hash value after an entry has changed.
*/
-static void ext4_xattr_rehash(struct ext4_xattr_header *header,
- struct ext4_xattr_entry *entry)
+static void ext4_xattr_rehash(struct ext4_xattr_header *header)
{
struct ext4_xattr_entry *here;
__u32 hash = 0;
- ext4_xattr_hash_entry(header, entry);
here = ENTRY(header+1);
while (!IS_LAST_ENTRY(here)) {
if (!here->e_hash) {
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 099c8b670ef5..0d2dde1fa87a 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -44,7 +44,7 @@ struct ext4_xattr_entry {
__u8 e_name_len; /* length of name */
__u8 e_name_index; /* attribute name index */
__le16 e_value_offs; /* offset in disk block of value */
- __le32 e_value_block; /* disk block attribute is stored on (n/i) */
+ __le32 e_value_inum; /* inode in which the value is stored */
__le32 e_value_size; /* size of attribute value */
__le32 e_hash; /* hash value of name and value */
char e_name[0]; /* attribute name */
@@ -69,6 +69,13 @@ struct ext4_xattr_entry {
EXT4_I(inode)->i_extra_isize))
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
+/*
+ * The minimum size of EA value when you start storing it in an external inode:
+ * size of block - size of header - size of 1 entry - 4 null bytes
+ * (with 4 KiB blocks: 4096 - 32 - 20 - 4 = 4040 bytes).
+ */
+#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \
+ ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
+
#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
#define BFIRST(bh) ENTRY(BHDR(bh)+1)
@@ -77,10 +84,11 @@ struct ext4_xattr_entry {
#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
struct ext4_xattr_info {
- int name_index;
const char *name;
const void *value;
size_t value_len;
+ int name_index;
+ int in_inode;
};
struct ext4_xattr_search {
@@ -96,6 +104,11 @@ struct ext4_xattr_ibody_find {
struct ext4_iloc iloc;
};
+struct ext4_xattr_inode_array {
+ unsigned int count; /* # of used items in the array */
+ struct inode *inodes[0];
+};
+
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
@@ -139,8 +152,16 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
+extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
+ bool is_create, int *credits);
+extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
+ struct buffer_head *block_bh, size_t value_len,
+ bool is_create);
-extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
+extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_inode_array **array,
+ int extra_credits);
+extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
@@ -169,3 +190,11 @@ static inline int ext4_init_security(handle_t *handle, struct inode *inode,
return 0;
}
#endif
+
+#ifdef CONFIG_LOCKDEP
+extern void ext4_xattr_inode_set_class(struct inode *ea_inode);
+#else
+static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { }
+#endif
+
+extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage);
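Taken together, the header changes let values larger than
EXT4_XATTR_MIN_LARGE_EA_SIZE() migrate into a dedicated inode once the
filesystem is created with the ea_inode feature (mkfs.ext4 -O ea_inode).
A quick userspace probe; the file argument and the 8 KiB size are arbitrary
choices for illustration:

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	static char big[8192];	/* comfortably above one 4 KiB xattr block */

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file-on-ext4>\n", argv[0]);
		return 1;
	}
	memset(big, 'x', sizeof(big));
	if (setxattr(argv[1], "user.big", big, sizeof(big), 0) != 0) {
		perror("setxattr");	/* fails without the ea_inode feature */
		return 1;
	}
	puts("large xattr stored");
	return 0;
}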
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 251356859476..87c1f4150c64 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -58,12 +58,12 @@ static void f2fs_read_end_io(struct bio *bio)
#ifdef CONFIG_F2FS_FAULT_INJECTION
if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) {
f2fs_show_injection_info(FAULT_IO);
- bio->bi_error = -EIO;
+ bio->bi_status = BLK_STS_IOERR;
}
#endif
if (f2fs_bio_encrypted(bio)) {
- if (bio->bi_error) {
+ if (bio->bi_status) {
fscrypt_release_ctx(bio->bi_private);
} else {
fscrypt_decrypt_bio_pages(bio->bi_private, bio);
@@ -74,7 +74,7 @@ static void f2fs_read_end_io(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
- if (!bio->bi_error) {
+ if (!bio->bi_status) {
if (!PageUptodate(page))
SetPageUptodate(page);
} else {
@@ -102,14 +102,14 @@ static void f2fs_write_end_io(struct bio *bio)
unlock_page(page);
mempool_free(page, sbi->write_io_dummy);
- if (unlikely(bio->bi_error))
+ if (unlikely(bio->bi_status))
f2fs_stop_checkpoint(sbi, true);
continue;
}
fscrypt_pullback_bio_page(&page, true);
- if (unlikely(bio->bi_error)) {
+ if (unlikely(bio->bi_status)) {
mapping_set_error(page->mapping, -EIO);
f2fs_stop_checkpoint(sbi, true);
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 42c39f0bfd88..94a88b233e98 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1116,6 +1116,7 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address,
{
SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver);
u32 *ctx = (u32 *)shash_desc_ctx(shash);
+ u32 retval;
int err;
shash->tfm = sbi->s_chksum_driver;
@@ -1125,7 +1126,9 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address,
err = crypto_shash_update(shash, address, length);
BUG_ON(err);
- return *ctx;
+ retval = *ctx;
+ barrier_data(ctx);
+ return retval;
}
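barrier_data() deserves a note: it is a compiler-only barrier that makes the
optimizer treat the pointed-to bytes as still in use, here presumably so the
hash state is read into retval before the on-stack descriptor can be
considered dead. The same primitive keeps stack-scrubbing memset() calls from
being dropped as dead stores; a minimal sketch (the macro body matches the
kernel's GCC/Clang definition, the surrounding demo is ours):

#include <string.h>

/* An empty asm that consumes the pointer and clobbers memory, so stores
 * through it cannot be eliminated (see include/linux/compiler*.h). */
#define barrier_data(ptr) __asm__ __volatile__("" : : "r"(ptr) : "memory")

static void handle_secret(void)
{
	char key[32];

	memset(key, 'k', sizeof(key));
	/* ... use key ... */
	memset(key, 0, sizeof(key));	/* an eliminable dead store... */
	barrier_data(key);		/* ...that must now survive optimization */
}

int main(void)
{
	handle_secret();
	return 0;
}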
static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
@@ -2161,26 +2164,6 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
return kmalloc(size, flags);
}
-static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
-{
- void *ret;
-
- ret = kmalloc(size, flags | __GFP_NOWARN);
- if (!ret)
- ret = __vmalloc(size, flags, PAGE_KERNEL);
- return ret;
-}
-
-static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
-{
- void *ret;
-
- ret = kzalloc(size, flags | __GFP_NOWARN);
- if (!ret)
- ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
- return ret;
-}
-
#define get_inode_mode(i) \
((is_inode_flag_set(i, FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 527c9f36d971..a0e6d2c65a9e 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1063,11 +1063,11 @@ static int __exchange_data_block(struct inode *src_inode,
while (len) {
olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len);
- src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL);
+ src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL);
if (!src_blkaddr)
return -ENOMEM;
- do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL);
+ do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL);
if (!do_replace) {
kvfree(src_blkaddr);
return -ENOMEM;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3ed2f947f5da..d53fe620939e 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2661,17 +2661,17 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
- nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks *
+ nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks *
NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL);
if (!nm_i->free_nid_bitmap)
return -ENOMEM;
- nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8,
+ nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8,
GFP_KERNEL);
if (!nm_i->nat_block_bitmap)
return -ENOMEM;
- nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks *
+ nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks *
sizeof(unsigned short), GFP_KERNEL);
if (!nm_i->free_nid_count)
return -ENOMEM;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4c246e351103..f964b68718c1 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -757,7 +757,7 @@ static void f2fs_submit_discard_endio(struct bio *bio)
{
struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
- dc->error = bio->bi_error;
+ dc->error = blk_status_to_errno(bio->bi_status);
dc->state = D_DONE;
complete_all(&dc->wait);
bio_put(bio);
@@ -2918,13 +2918,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
SM_I(sbi)->sit_info = sit_i;
- sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) *
+ sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) *
sizeof(struct seg_entry), GFP_KERNEL);
if (!sit_i->sentries)
return -ENOMEM;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
+ sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!sit_i->dirty_sentries_bitmap)
return -ENOMEM;
@@ -2957,7 +2957,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
return -ENOMEM;
if (sbi->segs_per_sec > 1) {
- sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) *
+ sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) *
sizeof(struct sec_entry), GFP_KERNEL);
if (!sit_i->sec_entries)
return -ENOMEM;
@@ -2990,7 +2990,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
sit_i->dirty_sentries = 0;
sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
- sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec;
+ sit_i->mounted_time = ktime_get_real_seconds();
mutex_init(&sit_i->sentry_lock);
return 0;
}
@@ -3008,12 +3008,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->free_info = free_i;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL);
+ free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL);
if (!free_i->free_segmap)
return -ENOMEM;
sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL);
+ free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL);
if (!free_i->free_secmap)
return -ENOMEM;
@@ -3193,7 +3193,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi)
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_secmap)
return -ENOMEM;
return 0;
@@ -3215,7 +3215,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
for (i = 0; i < NR_DIRTY_TYPE; i++) {
- dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->dirty_segmap[i])
return -ENOMEM;
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index e9ba1f1d9723..6b871b492fd5 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -716,8 +716,9 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
{
struct sit_info *sit_i = SIT_I(sbi);
- return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec -
- sit_i->mounted_time;
+ time64_t now = ktime_get_real_seconds();
+
+ return sit_i->elapsed_time + now - sit_i->mounted_time;
}
static inline void set_summary(struct f2fs_summary *sum, nid_t nid,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index dd92170e0d6d..32e4c025e97e 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1967,7 +1967,7 @@ try_onemore:
sb->s_time_gran = 1;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
- memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
+ memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
/* init f2fs-specific super block info */
sbi->valid_super_block = valid_super_block;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 8bd81c2e89b2..3b01b646e528 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -109,20 +109,34 @@ void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
}
EXPORT_SYMBOL(__f_setown);
-void f_setown(struct file *filp, unsigned long arg, int force)
+int f_setown(struct file *filp, unsigned long arg, int force)
{
enum pid_type type;
- struct pid *pid;
- int who = arg;
+ struct pid *pid = NULL;
+ int who = arg, ret = 0;
+
type = PIDTYPE_PID;
if (who < 0) {
+ /* avoid overflow below */
+ if (who == INT_MIN)
+ return -EINVAL;
+
type = PIDTYPE_PGID;
who = -who;
}
+
rcu_read_lock();
- pid = find_vpid(who);
- __f_setown(filp, pid, type, force);
+ if (who) {
+ pid = find_vpid(who);
+ if (!pid)
+ ret = -ESRCH;
+ }
+
+ if (!ret)
+ __f_setown(filp, pid, type, force);
rcu_read_unlock();
+
+ return ret;
}
EXPORT_SYMBOL(f_setown);
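Two userspace-visible changes fall out of giving f_setown() a return value:
arg == INT_MIN is rejected up front (negating it would overflow int), and a
pid that does not exist now reports ESRCH instead of silently clearing the
owner. A small probe, assuming a kernel with this change applied:

#include <stdio.h>
#include <limits.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	if (fcntl(STDIN_FILENO, F_SETOWN, INT_MIN) != 0)
		perror("F_SETOWN(INT_MIN)");	/* expect EINVAL */

	if (fcntl(STDIN_FILENO, F_SETOWN, getpid()) != 0)
		perror("F_SETOWN(getpid())");	/* expected to succeed */
	return 0;
}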
@@ -243,9 +257,72 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
}
#endif
+static bool rw_hint_valid(enum rw_hint hint)
+{
+ switch (hint) {
+ case RWF_WRITE_LIFE_NOT_SET:
+ case RWH_WRITE_LIFE_NONE:
+ case RWH_WRITE_LIFE_SHORT:
+ case RWH_WRITE_LIFE_MEDIUM:
+ case RWH_WRITE_LIFE_LONG:
+ case RWH_WRITE_LIFE_EXTREME:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static long fcntl_rw_hint(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct inode *inode = file_inode(file);
+ u64 *argp = (u64 __user *)arg;
+ enum rw_hint hint;
+ u64 h;
+
+ switch (cmd) {
+ case F_GET_FILE_RW_HINT:
+ h = file_write_hint(file);
+ if (copy_to_user(argp, &h, sizeof(*argp)))
+ return -EFAULT;
+ return 0;
+ case F_SET_FILE_RW_HINT:
+ if (copy_from_user(&h, argp, sizeof(h)))
+ return -EFAULT;
+ hint = (enum rw_hint) h;
+ if (!rw_hint_valid(hint))
+ return -EINVAL;
+
+ spin_lock(&file->f_lock);
+ file->f_write_hint = hint;
+ spin_unlock(&file->f_lock);
+ return 0;
+ case F_GET_RW_HINT:
+ h = inode->i_write_hint;
+ if (copy_to_user(argp, &h, sizeof(*argp)))
+ return -EFAULT;
+ return 0;
+ case F_SET_RW_HINT:
+ if (copy_from_user(&h, argp, sizeof(h)))
+ return -EFAULT;
+ hint = (enum rw_hint) h;
+ if (!rw_hint_valid(hint))
+ return -EINVAL;
+
+ inode_lock(inode);
+ inode->i_write_hint = hint;
+ inode_unlock(inode);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
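Note that all four hint commands take a pointer to a u64 rather than an
immediate value, hence the copy_from_user()/copy_to_user() pairs above. A
minimal userspace setter; the fallback constants are copied from
include/uapi/linux/fcntl.h for older toolchains, and "testfile" is an
arbitrary name:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#ifndef F_SET_RW_HINT
#define F_LINUX_SPECIFIC_BASE	1024
#define F_SET_RW_HINT		(F_LINUX_SPECIFIC_BASE + 12)
#define RWH_WRITE_LIFE_SHORT	2
#endif

int main(void)
{
	uint64_t hint = RWH_WRITE_LIFE_SHORT;
	int fd = open("testfile", O_WRONLY | O_CREAT, 0644);

	if (fd < 0 || fcntl(fd, F_SET_RW_HINT, &hint) != 0) {
		perror("F_SET_RW_HINT");	/* fails on kernels without it */
		return 1;
	}
	close(fd);
	return 0;
}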
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
+ void __user *argp = (void __user *)arg;
+ struct flock flock;
long err = -EINVAL;
switch (cmd) {
@@ -273,7 +350,11 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_OFD_GETLK:
#endif
case F_GETLK:
- err = fcntl_getlk(filp, cmd, (struct flock __user *) arg);
+ if (copy_from_user(&flock, argp, sizeof(flock)))
+ return -EFAULT;
+ err = fcntl_getlk(filp, cmd, &flock);
+ if (!err && copy_to_user(argp, &flock, sizeof(flock)))
+ return -EFAULT;
break;
#if BITS_PER_LONG != 32
/* 32-bit arches must use fcntl64() */
@@ -283,7 +364,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
/* Fallthrough */
case F_SETLK:
case F_SETLKW:
- err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg);
+ if (copy_from_user(&flock, argp, sizeof(flock)))
+ return -EFAULT;
+ err = fcntl_setlk(fd, filp, cmd, &flock);
break;
case F_GETOWN:
/*
@@ -297,8 +380,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
force_successful_syscall_return();
break;
case F_SETOWN:
- f_setown(filp, arg, 1);
- err = 0;
+ err = f_setown(filp, arg, 1);
break;
case F_GETOWN_EX:
err = f_getown_ex(filp, arg);
@@ -337,6 +419,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_GET_SEALS:
err = shmem_fcntl(filp, cmd, arg);
break;
+ case F_GET_RW_HINT:
+ case F_SET_RW_HINT:
+ case F_GET_FILE_RW_HINT:
+ case F_SET_FILE_RW_HINT:
+ err = fcntl_rw_hint(filp, cmd, arg);
+ break;
default:
break;
}
@@ -383,7 +471,9 @@ out:
SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
unsigned long, arg)
{
+ void __user *argp = (void __user *)arg;
struct fd f = fdget_raw(fd);
+ struct flock64 flock;
long err = -EBADF;
if (!f.file)
@@ -401,14 +491,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
switch (cmd) {
case F_GETLK64:
case F_OFD_GETLK:
- err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg);
+ err = -EFAULT;
+ if (copy_from_user(&flock, argp, sizeof(flock)))
+ break;
+ err = fcntl_getlk64(f.file, cmd, &flock);
+ if (!err && copy_to_user(argp, &flock, sizeof(flock)))
+ err = -EFAULT;
break;
case F_SETLK64:
case F_SETLKW64:
case F_OFD_SETLK:
case F_OFD_SETLKW:
- err = fcntl_setlk64(fd, f.file, cmd,
- (struct flock64 __user *) arg);
+ err = -EFAULT;
+ if (copy_from_user(&flock, argp, sizeof(flock)))
+ break;
+ err = fcntl_setlk64(fd, f.file, cmd, &flock);
break;
default:
err = do_fcntl(fd, cmd, arg, f.file);
@@ -422,57 +519,56 @@ out:
#endif
#ifdef CONFIG_COMPAT
-static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
-{
- if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
- __get_user(kfl->l_type, &ufl->l_type) ||
- __get_user(kfl->l_whence, &ufl->l_whence) ||
- __get_user(kfl->l_start, &ufl->l_start) ||
- __get_user(kfl->l_len, &ufl->l_len) ||
- __get_user(kfl->l_pid, &ufl->l_pid))
+/* careful - don't use anywhere else */
+#define copy_flock_fields(dst, src) \
+ (dst)->l_type = (src)->l_type; \
+ (dst)->l_whence = (src)->l_whence; \
+ (dst)->l_start = (src)->l_start; \
+ (dst)->l_len = (src)->l_len; \
+ (dst)->l_pid = (src)->l_pid;
+
+static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl)
+{
+ struct compat_flock fl;
+
+ if (copy_from_user(&fl, ufl, sizeof(struct compat_flock)))
return -EFAULT;
+ copy_flock_fields(kfl, &fl);
return 0;
}
-static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
+static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl)
{
- if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
- __put_user(kfl->l_type, &ufl->l_type) ||
- __put_user(kfl->l_whence, &ufl->l_whence) ||
- __put_user(kfl->l_start, &ufl->l_start) ||
- __put_user(kfl->l_len, &ufl->l_len) ||
- __put_user(kfl->l_pid, &ufl->l_pid))
+ struct compat_flock64 fl;
+
+ if (copy_from_user(&fl, ufl, sizeof(struct compat_flock64)))
return -EFAULT;
+ copy_flock_fields(kfl, &fl);
return 0;
}
-#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64
-static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
+static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl)
{
- if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
- __get_user(kfl->l_type, &ufl->l_type) ||
- __get_user(kfl->l_whence, &ufl->l_whence) ||
- __get_user(kfl->l_start, &ufl->l_start) ||
- __get_user(kfl->l_len, &ufl->l_len) ||
- __get_user(kfl->l_pid, &ufl->l_pid))
+ struct compat_flock fl;
+
+ memset(&fl, 0, sizeof(struct compat_flock));
+ copy_flock_fields(&fl, kfl);
+ if (copy_to_user(ufl, &fl, sizeof(struct compat_flock)))
return -EFAULT;
return 0;
}
-#endif
-#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64
-static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
+static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl)
{
- if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
- __put_user(kfl->l_type, &ufl->l_type) ||
- __put_user(kfl->l_whence, &ufl->l_whence) ||
- __put_user(kfl->l_start, &ufl->l_start) ||
- __put_user(kfl->l_len, &ufl->l_len) ||
- __put_user(kfl->l_pid, &ufl->l_pid))
+ struct compat_flock64 fl;
+
+ memset(&fl, 0, sizeof(struct compat_flock64));
+ copy_flock_fields(&fl, kfl);
+ if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64)))
return -EFAULT;
return 0;
}
-#endif
+#undef copy_flock_fields
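The "careful" warning exists because copy_flock_fields() expands to five bare
assignments rather than a do { ... } while (0) block; every call site in this
file is a standalone statement, so the shortcut is safe here. A sketch of the
failure mode the comment guards against (hypothetical macro and struct names):

#include <stdio.h>

/* multi-statement macro without the do { } while (0) wrapper */
#define copy_pair(dst, src)	\
	(dst)->a = (src)->a;	\
	(dst)->b = (src)->b;

struct pair { int a, b; };

int main(void)
{
	struct pair dst = { 0, 0 }, src = { 1, 2 };
	int cond = 0;

	if (cond)
		copy_pair(&dst, &src);	/* BUG: only the first assignment
					 * is guarded by the if */

	printf("%d %d\n", dst.a, dst.b);	/* prints "0 2", not "0 0" */
	return 0;
}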
static unsigned int
convert_fcntl_cmd(unsigned int cmd)
@@ -489,76 +585,92 @@ convert_fcntl_cmd(unsigned int cmd)
return cmd;
}
+/*
+ * GETLK was successful and we need to return the data, but it needs to fit in
+ * the compat structure.
+ * l_start shouldn't be too big, unless the original start + end is greater
+ * than COMPAT_OFF_T_MAX, in which case the app was asking for trouble and we
+ * return -EOVERFLOW. l_len could be too big, in which case we truncate it and
+ * only let the app see the part of the conflicting lock that might make sense
+ * to it anyway.
+ */
+static int fixup_compat_flock(struct flock *flock)
+{
+ if (flock->l_start > COMPAT_OFF_T_MAX)
+ return -EOVERFLOW;
+ if (flock->l_len > COMPAT_OFF_T_MAX)
+ flock->l_len = COMPAT_OFF_T_MAX;
+ return 0;
+}
+
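The policy is simple: an l_start that cannot be represented in the 32-bit
compat layout is a hard error, while an oversized l_len is silently clamped.
The same logic, standalone, with the compat limit spelled out:

#include <stdio.h>
#include <stdint.h>

#define COMPAT_OFF_T_MAX 0x7fffffff
#define EOVERFLOW 75			/* errno value on most Linux arches */

static int fixup_compat_flock(int64_t *l_start, int64_t *l_len)
{
	if (*l_start > COMPAT_OFF_T_MAX)
		return -EOVERFLOW;	/* the start itself is unrepresentable */
	if (*l_len > COMPAT_OFF_T_MAX)
		*l_len = COMPAT_OFF_T_MAX;	/* show the app what fits */
	return 0;
}

int main(void)
{
	int64_t start = 100, len = (int64_t)1 << 40;

	if (fixup_compat_flock(&start, &len) == 0)
		printf("len clamped to %lld\n", (long long)len);
	return 0;
}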
COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
compat_ulong_t, arg)
{
- mm_segment_t old_fs;
- struct flock f;
- long ret;
- unsigned int conv_cmd;
+ struct fd f = fdget_raw(fd);
+ struct flock flock;
+ long err = -EBADF;
+
+ if (!f.file)
+ return err;
+
+ if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ if (!check_fcntl_cmd(cmd))
+ goto out_put;
+ }
+
+ err = security_file_fcntl(f.file, cmd, arg);
+ if (err)
+ goto out_put;
switch (cmd) {
case F_GETLK:
+ err = get_compat_flock(&flock, compat_ptr(arg));
+ if (err)
+ break;
+ err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
+ if (err)
+ break;
+ err = fixup_compat_flock(&flock);
+ if (err)
+ return err;
+ err = put_compat_flock(&flock, compat_ptr(arg));
+ break;
+ case F_GETLK64:
+ case F_OFD_GETLK:
+ err = get_compat_flock64(&flock, compat_ptr(arg));
+ if (err)
+ break;
+ err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
+ if (err)
+ break;
+ err = fixup_compat_flock(&flock);
+ if (err)
+ return err;
+ err = put_compat_flock64(&flock, compat_ptr(arg));
+ break;
case F_SETLK:
case F_SETLKW:
- ret = get_compat_flock(&f, compat_ptr(arg));
- if (ret != 0)
+ err = get_compat_flock(&flock, compat_ptr(arg));
+ if (err)
break;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_fcntl(fd, cmd, (unsigned long)&f);
- set_fs(old_fs);
- if (cmd == F_GETLK && ret == 0) {
- /* GETLK was successful and we need to return the data...
- * but it needs to fit in the compat structure.
- * l_start shouldn't be too big, unless the original
- * start + end is greater than COMPAT_OFF_T_MAX, in which
- * case the app was asking for trouble, so we return
- * -EOVERFLOW in that case.
- * l_len could be too big, in which case we just truncate it,
- * and only allow the app to see that part of the conflicting
- * lock that might make sense to it anyway
- */
-
- if (f.l_start > COMPAT_OFF_T_MAX)
- ret = -EOVERFLOW;
- if (f.l_len > COMPAT_OFF_T_MAX)
- f.l_len = COMPAT_OFF_T_MAX;
- if (ret == 0)
- ret = put_compat_flock(&f, compat_ptr(arg));
- }
+ err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
break;
-
- case F_GETLK64:
case F_SETLK64:
case F_SETLKW64:
- case F_OFD_GETLK:
case F_OFD_SETLK:
case F_OFD_SETLKW:
- ret = get_compat_flock64(&f, compat_ptr(arg));
- if (ret != 0)
+ err = get_compat_flock64(&flock, compat_ptr(arg));
+ if (err)
break;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- conv_cmd = convert_fcntl_cmd(cmd);
- ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
- set_fs(old_fs);
- if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
- /* need to return lock information - see above for commentary */
- if (f.l_start > COMPAT_LOFF_T_MAX)
- ret = -EOVERFLOW;
- if (f.l_len > COMPAT_LOFF_T_MAX)
- f.l_len = COMPAT_LOFF_T_MAX;
- if (ret == 0)
- ret = put_compat_flock64(&f, compat_ptr(arg));
- }
+ err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
break;
-
default:
- ret = sys_fcntl(fd, cmd, arg);
+ err = do_fcntl(fd, cmd, arg, f.file);
break;
}
- return ret;
+out_put:
+ fdput(f);
+ return err;
}
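Whichever way a lock request arrives (native fcntl(), fcntl64(), or the
compat path above), it now funnels through the same kernel-stack struct
flock; the userspace contract is unchanged. For reference, a conflict probe
("lockfile" is an arbitrary name):

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	struct flock fl;
	int fd = open("lockfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	memset(&fl, 0, sizeof(fl));
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;		/* l_start = l_len = 0: whole file */
	if (fcntl(fd, F_GETLK, &fl) == 0) {
		if (fl.l_type == F_UNLCK)
			puts("no conflicting lock");
		else
			printf("conflicting lock held by pid %d\n",
			       (int)fl.l_pid);
	}
	close(fd);
	return 0;
}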
COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
@@ -899,16 +1011,10 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two-bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
- BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
- O_RDONLY | O_WRONLY | O_RDWR |
- O_CREAT | O_EXCL | O_NOCTTY |
- O_TRUNC | O_APPEND | /* O_NONBLOCK | */
- __O_SYNC | O_DSYNC | FASYNC |
- O_DIRECT | O_LARGEFILE | O_DIRECTORY |
- O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
- __FMODE_EXEC | O_PATH | __O_TMPFILE |
- __FMODE_NONOTIFY
- ));
+ BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
+ HWEIGHT32(
+ (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
+ __FMODE_EXEC | __FMODE_NONOTIFY));
fasync_cache = kmem_cache_create("fasync_cache",
sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
diff --git a/fs/file.c b/fs/file.c
index ad6f094f2eff..1fc7fbbb4510 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -30,21 +30,6 @@ unsigned int sysctl_nr_open_min = BITS_PER_LONG;
unsigned int sysctl_nr_open_max =
__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
-static void *alloc_fdmem(size_t size)
-{
- /*
- * Very large allocations can stress page reclaim, so fall back to
- * vmalloc() if the allocation size will be considered "large" by the VM.
- */
- if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- void *data = kmalloc(size, GFP_KERNEL_ACCOUNT |
- __GFP_NOWARN | __GFP_NORETRY);
- if (data != NULL)
- return data;
- }
- return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
-}
-
static void __free_fdtable(struct fdtable *fdt)
{
kvfree(fdt->fd);
@@ -131,13 +116,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
if (!fdt)
goto out;
fdt->max_fds = nr;
- data = alloc_fdmem(nr * sizeof(struct file *));
+ data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
if (!data)
goto out_fdt;
fdt->fd = data;
- data = alloc_fdmem(max_t(size_t,
- 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES));
+ data = kvmalloc(max_t(size_t,
+ 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
+ GFP_KERNEL_ACCOUNT);
if (!data)
goto out_arr;
fdt->open_fds = data;
diff --git a/fs/file_table.c b/fs/file_table.c
index 954d510b765a..72e861a35a7f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -168,6 +168,7 @@ struct file *alloc_file(const struct path *path, fmode_t mode,
file->f_path = *path;
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
+ file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
if ((mode & FMODE_READ) &&
likely(fop->read || fop->read_iter))
mode |= FMODE_CAN_READ;
diff --git a/fs/filesystems.c b/fs/filesystems.c
index cac75547d35c..8b99955e3504 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -275,8 +275,10 @@ struct file_system_type *get_fs_type(const char *name)
int len = dot ? dot - name : strlen(name);
fs = __get_fs_type(name, len);
- if (!fs && (request_module("fs-%.*s", len, name) == 0))
+ if (!fs && (request_module("fs-%.*s", len, name) == 0)) {
fs = __get_fs_type(name, len);
+ WARN_ONCE(!fs, "request_module fs-%.*s succeeded, but still no fs?\n", len, name);
+ }
if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
put_filesystem(fs);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 63ee2940775c..8b426f83909f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2052,11 +2052,13 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
}
/**
- * __mark_inode_dirty - internal function
- * @inode: inode to mark
- * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
- * Mark an inode as dirty. Callers should use mark_inode_dirty or
- * mark_inode_dirty_sync.
+ * __mark_inode_dirty - internal function
+ *
+ * @inode: inode to mark
+ * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
+ *
+ * Mark an inode as dirty. Callers should use mark_inode_dirty or
+ * mark_inode_dirty_sync.
*
* Put the inode on the super block's dirty list.
*
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 611b5408f6ec..e747b3d720ee 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -34,7 +34,7 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m)
void pin_kill(struct fs_pin *p)
{
- wait_queue_t wait;
+ wait_queue_entry_t wait;
if (!p) {
rcu_read_unlock();
@@ -61,7 +61,7 @@ void pin_kill(struct fs_pin *p)
rcu_read_unlock();
schedule();
rcu_read_lock();
- if (likely(list_empty(&wait.task_list)))
+ if (likely(list_empty(&wait.entry)))
break;
/* OK, we know p couldn't have been freed yet */
spin_lock_irq(&p->wait.lock);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 6e22748b0704..b9ea99c5b5b3 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -292,7 +292,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
{
- struct tree_descr empty_descr = {""};
+ static const struct tree_descr empty_descr = {""};
struct fuse_conn *fc;
int err;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c2d7f3a92679..c16d00e53264 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -20,6 +20,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
+#include <linux/sched.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
@@ -45,7 +46,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages,
INIT_LIST_HEAD(&req->list);
INIT_LIST_HEAD(&req->intr_entry);
init_waitqueue_head(&req->waitq);
- atomic_set(&req->count, 1);
+ refcount_set(&req->count, 1);
req->pages = pages;
req->page_descs = page_descs;
req->max_pages = npages;
@@ -102,21 +103,20 @@ void fuse_request_free(struct fuse_req *req)
void __fuse_get_request(struct fuse_req *req)
{
- atomic_inc(&req->count);
+ refcount_inc(&req->count);
}
/* Must be called with > 1 refcount */
static void __fuse_put_request(struct fuse_req *req)
{
- BUG_ON(atomic_read(&req->count) < 2);
- atomic_dec(&req->count);
+ refcount_dec(&req->count);
}
-static void fuse_req_init_context(struct fuse_req *req)
+static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req)
{
req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
- req->in.h.pid = current->pid;
+ req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
}
void fuse_set_initialized(struct fuse_conn *fc)
@@ -163,7 +163,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
goto out;
}
- fuse_req_init_context(req);
+ fuse_req_init_context(fc, req);
__set_bit(FR_WAITING, &req->flags);
if (for_background)
__set_bit(FR_BACKGROUND, &req->flags);
@@ -256,7 +256,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
if (!req)
req = get_reserved_req(fc, file);
- fuse_req_init_context(req);
+ fuse_req_init_context(fc, req);
__set_bit(FR_WAITING, &req->flags);
__clear_bit(FR_BACKGROUND, &req->flags);
return req;
@@ -264,7 +264,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
{
- if (atomic_dec_and_test(&req->count)) {
+ if (refcount_dec_and_test(&req->count)) {
if (test_bit(FR_BACKGROUND, &req->flags)) {
/*
* We get here in the unlikely case that a background
@@ -1222,6 +1222,9 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
struct fuse_in *in;
unsigned reqsize;
+ if (task_active_pid_ns(current) != fc->pid_ns)
+ return -EIO;
+
restart:
spin_lock(&fiq->waitq.lock);
err = -EAGAIN;
@@ -1820,6 +1823,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
struct fuse_req *req;
struct fuse_out_header oh;
+ if (task_active_pid_ns(current) != fc->pid_ns)
+ return -EIO;
+
if (nbytes < sizeof(struct fuse_out_header))
return -EINVAL;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ec238fb5a584..3ee4fdc3da9e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -58,7 +58,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
}
INIT_LIST_HEAD(&ff->write_entry);
- atomic_set(&ff->count, 1);
+ refcount_set(&ff->count, 1);
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
@@ -77,7 +77,7 @@ void fuse_file_free(struct fuse_file *ff)
static struct fuse_file *fuse_file_get(struct fuse_file *ff)
{
- atomic_inc(&ff->count);
+ refcount_inc(&ff->count);
return ff;
}
@@ -88,7 +88,7 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
static void fuse_file_put(struct fuse_file *ff, bool sync)
{
- if (atomic_dec_and_test(&ff->count)) {
+ if (refcount_dec_and_test(&ff->count)) {
struct fuse_req *req = ff->reserved_req;
if (ff->fc->no_open) {
@@ -293,7 +293,7 @@ static int fuse_release(struct inode *inode, struct file *file)
void fuse_sync_release(struct fuse_file *ff, int flags)
{
- WARN_ON(atomic_read(&ff->count) != 1);
+ WARN_ON(refcount_read(&ff->count) > 1);
fuse_prepare_release(ff, flags, FUSE_RELEASE);
/*
* iput(NULL) is a no-op and since the refcount is 1 and everything's
@@ -2083,7 +2083,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
return generic_file_mmap(file, vma);
}
-static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
+static int convert_fuse_file_lock(struct fuse_conn *fc,
+ const struct fuse_file_lock *ffl,
struct file_lock *fl)
{
switch (ffl->type) {
@@ -2098,7 +2099,14 @@ static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
fl->fl_start = ffl->start;
fl->fl_end = ffl->end;
- fl->fl_pid = ffl->pid;
+
+ /*
+ * Convert pid into the caller's pid namespace. If the pid
+ * does not map into the namespace fl_pid will get set to 0.
+ */
+ rcu_read_lock();
+ fl->fl_pid = pid_vnr(find_pid_ns(ffl->pid, fc->pid_ns));
+ rcu_read_unlock();
break;
default:
@@ -2147,7 +2155,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
args.out.args[0].value = &outarg;
err = fuse_simple_request(fc, &args);
if (!err)
- err = convert_fuse_file_lock(&outarg.lk, fl);
+ err = convert_fuse_file_lock(fc, &outarg.lk, fl);
return err;
}
@@ -2159,7 +2167,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
FUSE_ARGS(args);
struct fuse_lk_in inarg;
int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
- pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
+ struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
+ pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns);
int err;
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
@@ -2168,10 +2177,13 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
}
/* Unlock on close is handled by the flush method */
- if (fl->fl_flags & FL_CLOSE)
+ if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
return 0;
- fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg);
+ if (pid && pid_nr == 0)
+ return -EOVERFLOW;
+
+ fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
err = fuse_simple_request(fc, &args);
/* locking is restartable */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f33341d9501a..1bd7ffdad593 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -24,6 +24,8 @@
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/xattr.h>
+#include <linux/pid_namespace.h>
+#include <linux/refcount.h>
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
@@ -137,7 +139,7 @@ struct fuse_file {
u64 nodeid;
/** Refcount */
- atomic_t count;
+ refcount_t count;
/** FOPEN_* flags returned by open */
u32 open_flags;
@@ -306,7 +308,7 @@ struct fuse_req {
struct list_head intr_entry;
/** refcount */
- atomic_t count;
+ refcount_t count;
/** Unique ID for the interrupt request */
u64 intr_unique;
@@ -448,7 +450,7 @@ struct fuse_conn {
spinlock_t lock;
/** Refcount */
- atomic_t count;
+ refcount_t count;
/** Number of fuse_dev's */
atomic_t dev_count;
@@ -461,6 +463,9 @@ struct fuse_conn {
/** The group id for this mount */
kgid_t group_id;
+ /** The pid namespace for this mount */
+ struct pid_namespace *pid_ns;
+
/** Maximum read size */
unsigned max_read;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 73cf05135252..65c88379a3a1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -21,6 +21,7 @@
#include <linux/sched.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
+#include <linux/pid_namespace.h>
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -601,7 +602,7 @@ void fuse_conn_init(struct fuse_conn *fc)
memset(fc, 0, sizeof(*fc));
spin_lock_init(&fc->lock);
init_rwsem(&fc->killsb);
- atomic_set(&fc->count, 1);
+ refcount_set(&fc->count, 1);
atomic_set(&fc->dev_count, 1);
init_waitqueue_head(&fc->blocked_waitq);
init_waitqueue_head(&fc->reserved_req_waitq);
@@ -619,14 +620,16 @@ void fuse_conn_init(struct fuse_conn *fc)
fc->connected = 1;
fc->attr_version = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+ fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
void fuse_conn_put(struct fuse_conn *fc)
{
- if (atomic_dec_and_test(&fc->count)) {
+ if (refcount_dec_and_test(&fc->count)) {
if (fc->destroy_req)
fuse_request_free(fc->destroy_req);
+ put_pid_ns(fc->pid_ns);
fc->release(fc);
}
}
@@ -634,7 +637,7 @@ EXPORT_SYMBOL_GPL(fuse_conn_put);
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
{
- atomic_inc(&fc->count);
+ refcount_inc(&fc->count);
return fc;
}
EXPORT_SYMBOL_GPL(fuse_conn_get);
@@ -972,8 +975,15 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
int err;
char *suffix = "";
- if (sb->s_bdev)
+ if (sb->s_bdev) {
suffix = "-fuseblk";
+ /*
+ * sb->s_bdi points to blkdev's bdi; however, we want to redirect
+ * it to our private bdi...
+ */
+ bdi_put(sb->s_bdi);
+ sb->s_bdi = &noop_backing_dev_info;
+ }
err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev),
MINOR(fc->dev), suffix);
if (err)
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3814a60e0aea..9fa3aef9a5b3 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -970,7 +970,7 @@ more_rgrps:
continue;
bn = be64_to_cpu(*p);
if (gfs2_holder_initialized(rd_gh)) {
- rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object;
+ rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
gfs2_assert_withdraw(sdp,
gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
} else {
@@ -1072,7 +1072,7 @@ out_unlock:
/* Every transaction boundary, we rewrite the dinode
to keep its di_blocks current in case of failure. */
ip->i_inode.i_mtime = ip->i_inode.i_ctime =
- CURRENT_TIME;
+ current_time(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
up_write(&ip->i_rw_mutex);
@@ -1293,7 +1293,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
gfs2_statfs_change(sdp, 0, +btotal, 0);
gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
ip->i_inode.i_gid);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
up_write(&ip->i_rw_mutex);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 79113219be5f..db427658ccd9 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1444,7 +1444,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
"g.offset (%u)\n",
(unsigned long long)bh->b_blocknr,
entries2, g.offset);
-
+ gfs2_consist_inode(ip);
error = -EIO;
goto out_free;
}
@@ -1612,6 +1612,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
(unsigned long long)dip->i_no_addr,
dip->i_entries,
g.offset);
+ gfs2_consist_inode(dip);
error = -EIO;
goto out;
}
@@ -2031,8 +2032,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
for (x = 0; x < rlist.rl_rgrps; x++) {
- struct gfs2_rgrpd *rgd;
- rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+ struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
+
rg_blocks += rgd->rd_length;
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 959a19ced4d5..c38ab6c81898 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -80,9 +80,9 @@ static struct rhashtable_params ht_parms = {
static struct rhashtable gl_hash_table;
-void gfs2_glock_free(struct gfs2_glock *gl)
+static void gfs2_glock_dealloc(struct rcu_head *rcu)
{
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
if (gl->gl_ops->go_flags & GLOF_ASPACE) {
kmem_cache_free(gfs2_glock_aspace_cachep, gl);
@@ -90,6 +90,13 @@ void gfs2_glock_free(struct gfs2_glock *gl)
kfree(gl->gl_lksb.sb_lvbptr);
kmem_cache_free(gfs2_glock_cachep, gl);
}
+}
+
+void gfs2_glock_free(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+ call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
if (atomic_dec_and_test(&sdp->sd_glock_disposal))
wake_up(&sdp->sd_glock_wait);
}
@@ -152,20 +159,34 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
spin_unlock(&lru_lock);
}
-/**
- * gfs2_glock_put() - Decrement reference count on glock
- * @gl: The glock to put
- *
+/*
+ * Enqueue the glock on the work queue. Passes one glock reference on to the
+ * work queue.
*/
+static void __gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay)
+{
+ if (!queue_delayed_work(glock_workqueue, &gl->gl_work, delay)) {
+ /*
+ * We are holding the lockref spinlock, and the work was still
+ * queued above. The queued work (glock_work_func) takes that
+ * spinlock before dropping its glock reference(s), so it
+ * cannot have dropped them in the meantime.
+ */
+ GLOCK_BUG_ON(gl, gl->gl_lockref.count < 2);
+ gl->gl_lockref.count--;
+ }
+}
-void gfs2_glock_put(struct gfs2_glock *gl)
+static void gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay)
+{
+ spin_lock(&gl->gl_lockref.lock);
+ __gfs2_glock_queue_work(gl, delay);
+ spin_unlock(&gl->gl_lockref.lock);
+}
+
+static void __gfs2_glock_put(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = gfs2_glock2aspace(gl);
- if (lockref_put_or_lock(&gl->gl_lockref))
- return;
-
lockref_mark_dead(&gl->gl_lockref);
gfs2_glock_remove_from_lru(gl);
@@ -178,6 +199,20 @@ void gfs2_glock_put(struct gfs2_glock *gl)
}
/**
+ * gfs2_glock_put() - Decrement reference count on glock
+ * @gl: The glock to put
+ *
+ */
+
+void gfs2_glock_put(struct gfs2_glock *gl)
+{
+ if (lockref_put_or_lock(&gl->gl_lockref))
+ return;
+
+ __gfs2_glock_put(gl);
+}
+
+/**
* may_grant - check if its ok to grant a new lock
* @gl: The glock
* @gh: The lock request which we wish to grant
@@ -482,8 +517,7 @@ __acquires(&gl->gl_lockref.lock)
target == LM_ST_UNLOCKED &&
test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) {
finish_xmote(gl, target);
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gfs2_glock_put(gl);
+ gfs2_glock_queue_work(gl, 0);
}
else if (ret) {
pr_err("lm_lock ret %d\n", ret);
@@ -492,8 +526,7 @@ __acquires(&gl->gl_lockref.lock)
}
} else { /* lock_nolock */
finish_xmote(gl, target);
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gfs2_glock_put(gl);
+ gfs2_glock_queue_work(gl, 0);
}
spin_lock(&gl->gl_lockref.lock);
@@ -565,8 +598,7 @@ out_sched:
clear_bit(GLF_LOCK, &gl->gl_flags);
smp_mb__after_atomic();
gl->gl_lockref.count++;
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gl->gl_lockref.count--;
+ __gfs2_glock_queue_work(gl, 0);
return;
out_unlock:
@@ -601,11 +633,11 @@ static void glock_work_func(struct work_struct *work)
{
unsigned long delay = 0;
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
- int drop_ref = 0;
+ unsigned int drop_refs = 1;
if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
finish_xmote(gl, gl->gl_reply);
- drop_ref = 1;
+ drop_refs++;
}
spin_lock(&gl->gl_lockref.lock);
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -623,17 +655,25 @@ static void glock_work_func(struct work_struct *work)
}
}
run_queue(gl, 0);
- spin_unlock(&gl->gl_lockref.lock);
- if (!delay)
- gfs2_glock_put(gl);
- else {
+ if (delay) {
+ /* Keep one glock reference for the work we requeue. */
+ drop_refs--;
if (gl->gl_name.ln_type != LM_TYPE_INODE)
delay = 0;
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
- gfs2_glock_put(gl);
+ __gfs2_glock_queue_work(gl, delay);
}
- if (drop_ref)
- gfs2_glock_put(gl);
+
+ /*
+ * Drop the remaining glock references manually here. (Mind that
+ * __gfs2_glock_queue_work depends on the lockref spinlock being held
+ * here as well.)
+ */
+ gl->gl_lockref.count -= drop_refs;
+ if (!gl->gl_lockref.count) {
+ __gfs2_glock_put(gl);
+ return;
+ }
+ spin_unlock(&gl->gl_lockref.lock);
}
/**
@@ -986,8 +1026,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
gl->gl_lockref.count++;
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gl->gl_lockref.count--;
+ __gfs2_glock_queue_work(gl, 0);
}
run_queue(gl, 1);
spin_unlock(&gl->gl_lockref.lock);
@@ -1047,17 +1086,15 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
gfs2_glock_add_to_lru(gl);
trace_gfs2_glock_queue(gh, 0);
+ if (unlikely(!fast_path)) {
+ gl->gl_lockref.count++;
+ if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+ !test_bit(GLF_DEMOTE, &gl->gl_flags) &&
+ gl->gl_name.ln_type == LM_TYPE_INODE)
+ delay = gl->gl_hold_time;
+ __gfs2_glock_queue_work(gl, delay);
+ }
spin_unlock(&gl->gl_lockref.lock);
- if (likely(fast_path))
- return;
-
- gfs2_glock_hold(gl);
- if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
- !test_bit(GLF_DEMOTE, &gl->gl_flags) &&
- gl->gl_name.ln_type == LM_TYPE_INODE)
- delay = gl->gl_hold_time;
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
- gfs2_glock_put(gl);
}
void gfs2_glock_dq_wait(struct gfs2_holder *gh)
@@ -1233,9 +1270,8 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
spin_lock(&gl->gl_lockref.lock);
handle_callback(gl, state, delay, true);
+ __gfs2_glock_queue_work(gl, delay);
spin_unlock(&gl->gl_lockref.lock);
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
- gfs2_glock_put(gl);
}
/**
@@ -1294,10 +1330,8 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
gl->gl_lockref.count++;
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+ __gfs2_glock_queue_work(gl, 0);
spin_unlock(&gl->gl_lockref.lock);
-
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gfs2_glock_put(gl);
}
static int glock_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -1355,8 +1389,7 @@ add_back_to_lru:
if (demote_ok(gl))
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gl->gl_lockref.count--;
+ __gfs2_glock_queue_work(gl, 0);
spin_unlock(&gl->gl_lockref.lock);
cond_resched_lock(&lru_lock);
}
@@ -1462,13 +1495,12 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
static void thaw_glock(struct gfs2_glock *gl)
{
- if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
- goto out;
- set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) {
-out:
+ if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) {
gfs2_glock_put(gl);
+ return;
}
+ set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+ gfs2_glock_queue_work(gl, 0);
}
/**
@@ -1484,9 +1516,8 @@ static void clear_glock(struct gfs2_glock *gl)
spin_lock(&gl->gl_lockref.lock);
if (gl->gl_state != LM_ST_UNLOCKED)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+ __gfs2_glock_queue_work(gl, 0);
spin_unlock(&gl->gl_lockref.lock);
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gfs2_glock_put(gl);
}
/**
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index ab1ef322f7a5..9ad4a6ac6c84 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -257,4 +257,11 @@ static inline bool gfs2_holder_initialized(struct gfs2_holder *gh)
return gh->gh_gl;
}
+static inline void glock_set_object(struct gfs2_glock *gl, void *object)
+{
+ spin_lock(&gl->gl_lockref.lock);
+ gl->gl_object = object;
+ spin_unlock(&gl->gl_lockref.lock);
+}
+
#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 5db59d444838..5e69636d4dd3 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -137,7 +137,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
*
* Called when demoting or unlocking an EX glock. We must flush
* to disk all dirty buffers/pages relating to this glock, and must not
- * not return to caller to demote/unlock the glock until I/O is complete.
+ * return to caller to demote/unlock the glock until I/O is complete.
*/
static void rgrp_go_sync(struct gfs2_glock *gl)
@@ -184,7 +184,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = &sdp->sd_aspace;
- struct gfs2_rgrpd *rgd = gl->gl_object;
+ struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
if (rgd)
gfs2_rgrp_brelse(rgd);
@@ -197,6 +197,38 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
}
+static struct gfs2_inode *gfs2_glock2inode(struct gfs2_glock *gl)
+{
+ struct gfs2_inode *ip;
+
+ spin_lock(&gl->gl_lockref.lock);
+ ip = gl->gl_object;
+ if (ip)
+ set_bit(GIF_GLOP_PENDING, &ip->i_flags);
+ spin_unlock(&gl->gl_lockref.lock);
+ return ip;
+}
+
+struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl)
+{
+ struct gfs2_rgrpd *rgd;
+
+ spin_lock(&gl->gl_lockref.lock);
+ rgd = gl->gl_object;
+ spin_unlock(&gl->gl_lockref.lock);
+
+ return rgd;
+}
+
+static void gfs2_clear_glop_pending(struct gfs2_inode *ip)
+{
+ if (!ip)
+ return;
+
+ clear_bit_unlock(GIF_GLOP_PENDING, &ip->i_flags);
+ wake_up_bit(&ip->i_flags, GIF_GLOP_PENDING);
+}
+
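
Together with the GIF_GLOP_PENDING flag added to incore.h below, these helpers form a hand-rolled busy marker: the glock operation pins the inode under gl_lockref.lock, and eviction waits for the bit to clear before tearing the inode down (the matching wait_on_bit_io() appears in the fs/gfs2/super.c hunk further on). Condensed, the protocol is (all calls are real kernel primitives):

    /* glock-op side: pin the inode while holding the glock spinlock */
    spin_lock(&gl->gl_lockref.lock);
    ip = gl->gl_object;
    if (ip)
            set_bit(GIF_GLOP_PENDING, &ip->i_flags);
    spin_unlock(&gl->gl_lockref.lock);

    /* ... operate on ip, then release with a wakeup: */
    clear_bit_unlock(GIF_GLOP_PENDING, &ip->i_flags);  /* release semantics */
    wake_up_bit(&ip->i_flags, GIF_GLOP_PENDING);

    /* evict side: wait until no glock operation still uses the inode */
    wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);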
/**
* inode_go_sync - Sync the dirty data and/or metadata for an inode glock
* @gl: the glock protecting the inode
@@ -205,25 +237,24 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
static void inode_go_sync(struct gfs2_glock *gl)
{
- struct gfs2_inode *ip = gl->gl_object;
+ struct gfs2_inode *ip = gfs2_glock2inode(gl);
+ int isreg = ip && S_ISREG(ip->i_inode.i_mode);
struct address_space *metamapping = gfs2_glock2aspace(gl);
int error;
- if (ip && !S_ISREG(ip->i_inode.i_mode))
- ip = NULL;
- if (ip) {
+ if (isreg) {
if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
inode_dio_wait(&ip->i_inode);
}
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
- return;
+ goto out;
GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
gfs2_log_flush(gl->gl_name.ln_sbd, gl, NORMAL_FLUSH);
filemap_fdatawrite(metamapping);
- if (ip) {
+ if (isreg) {
struct address_space *mapping = ip->i_inode.i_mapping;
filemap_fdatawrite(mapping);
error = filemap_fdatawait(mapping);
@@ -238,6 +269,9 @@ static void inode_go_sync(struct gfs2_glock *gl)
*/
smp_mb__before_atomic();
clear_bit(GLF_DIRTY, &gl->gl_flags);
+
+out:
+ gfs2_clear_glop_pending(ip);
}
/**
@@ -253,7 +287,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
static void inode_go_inval(struct gfs2_glock *gl, int flags)
{
- struct gfs2_inode *ip = gl->gl_object;
+ struct gfs2_inode *ip = gfs2_glock2inode(gl);
gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count));
@@ -274,6 +308,8 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
}
if (ip && S_ISREG(ip->i_inode.i_mode))
truncate_inode_pages(ip->i_inode.i_mapping, 0);
+
+ gfs2_clear_glop_pending(ip);
}
/**
@@ -541,7 +577,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl)
*/
static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
{
- struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
+ struct gfs2_inode *ip = gl->gl_object;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY))
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b7cf65d13561..73fce76e67ee 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -336,7 +336,6 @@ enum {
};
struct gfs2_glock {
- struct hlist_bl_node gl_list;
unsigned long gl_flags; /* GLF_... */
struct lm_lockname gl_name;
@@ -374,6 +373,7 @@ struct gfs2_glock {
loff_t end;
} gl_vm;
};
+ struct rcu_head gl_rcu;
struct rhash_head gl_node;
};
@@ -386,6 +386,7 @@ enum {
GIF_SW_PAGED = 3,
GIF_ORDERED = 4,
GIF_FREE_VFS_INODE = 5,
+ GIF_GLOP_PENDING = 6,
};
struct gfs2_inode {
@@ -815,13 +816,11 @@ struct gfs2_sbd {
atomic_t sd_log_in_flight;
struct bio *sd_log_bio;
wait_queue_head_t sd_log_flush_wait;
- int sd_log_error;
atomic_t sd_reserving_log;
wait_queue_head_t sd_reserving_log_wait;
unsigned int sd_log_flush_head;
- u64 sd_log_flush_wrapped;
spinlock_t sd_ail_lock;
struct list_head sd_ail1_list;
@@ -858,5 +857,7 @@ static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which)
preempt_enable();
}
+extern struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl);
+
#endif /* __INCORE_DOT_H__ */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9f605ea4810c..acca501f8110 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -144,7 +144,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
if (unlikely(error))
goto fail;
- ip->i_gl->gl_object = ip;
+ flush_delayed_work(&ip->i_gl->gl_work);
+ glock_set_object(ip->i_gl, ip);
error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
if (unlikely(error))
@@ -173,8 +174,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (unlikely(error))
goto fail_put;
-
- ip->i_iopen_gh.gh_gl->gl_object = ip;
+ flush_delayed_work(&ip->i_iopen_gh.gh_gl->gl_work);
+ glock_set_object(ip->i_iopen_gh.gh_gl, ip);
gfs2_glock_put(io_gl);
io_gl = NULL;
@@ -201,14 +202,14 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
fail_refresh:
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- ip->i_iopen_gh.gh_gl->gl_object = NULL;
+ glock_set_object(ip->i_iopen_gh.gh_gl, NULL);
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
fail_put:
if (io_gl)
gfs2_glock_put(io_gl);
if (gfs2_holder_initialized(&i_gh))
gfs2_glock_dq_uninit(&i_gh);
- ip->i_gl->gl_object = NULL;
+ glock_set_object(ip->i_gl, NULL);
fail:
iget_failed(inode);
return ERR_PTR(error);
@@ -607,6 +608,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
if (error)
goto fail;
+ gfs2_holder_mark_uninitialized(ghs + 1);
error = create_ok(dip, name, mode);
if (error)
@@ -705,7 +707,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_free_inode;
- ip->i_gl->gl_object = ip;
+ glock_set_object(ip->i_gl, ip);
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
if (error)
goto fail_free_inode;
@@ -731,7 +733,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_gunlock2;
- ip->i_iopen_gh.gh_gl->gl_object = ip;
+ glock_set_object(ip->i_iopen_gh.gh_gl, ip);
gfs2_glock_put(io_gl);
gfs2_set_iop(inode);
insert_inode_hash(inode);
@@ -778,7 +780,6 @@ fail_gunlock3:
fail_gunlock2:
if (io_gl)
clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
- gfs2_glock_dq_uninit(ghs + 1);
fail_free_inode:
if (ip->i_gl)
gfs2_glock_put(ip->i_gl);
@@ -799,6 +800,8 @@ fail_gunlock:
&GFS2_I(inode)->i_flags);
iput(inode);
}
+ if (gfs2_holder_initialized(ghs + 1))
+ gfs2_glock_dq_uninit(ghs + 1);
fail:
return error;
}
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index f865b96374df..9a624f694400 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -659,7 +659,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
struct gfs2_log_header *lh;
unsigned int tail;
u32 hash;
- int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META;
+ int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC;
struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
lh = page_address(page);
@@ -722,7 +722,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
sdp->sd_log_flush_head = sdp->sd_log_head;
- sdp->sd_log_flush_wrapped = 0;
tr = sdp->sd_log_tr;
if (tr) {
sdp->sd_log_tr = NULL;
@@ -775,7 +774,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
}
atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
trace_gfs2_log_blocks(sdp, -1);
- sdp->sd_log_flush_wrapped = 0;
log_write_header(sdp, 0);
sdp->sd_log_head = sdp->sd_log_flush_head;
}
@@ -880,7 +878,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
sdp->sd_log_flush_head = sdp->sd_log_head;
- sdp->sd_log_flush_wrapped = 0;
log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index b1f9144b42c7..3010f9edd177 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -71,7 +71,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
{
struct gfs2_glock *gl = bd->bd_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct gfs2_rgrpd *rgd = gl->gl_object;
+ struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
struct gfs2_bitmap *bi = rgd->rd_bits + index;
@@ -134,10 +134,8 @@ static void gfs2_log_incr_head(struct gfs2_sbd *sdp)
BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) &&
(sdp->sd_log_flush_head != sdp->sd_log_head));
- if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
+ if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks)
sdp->sd_log_flush_head = 0;
- sdp->sd_log_flush_wrapped = 1;
- }
}
static u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
@@ -170,7 +168,7 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
*/
static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
- int error)
+ blk_status_t error)
{
struct buffer_head *bh, *next;
struct page *page = bvec->bv_page;
@@ -182,7 +180,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
bh = bh->b_this_page;
do {
if (error)
- set_buffer_write_io_error(bh);
+ mark_buffer_write_io_error(bh);
unlock_buffer(bh);
next = bh->b_this_page;
size -= bh->b_size;
@@ -209,15 +207,13 @@ static void gfs2_end_log_write(struct bio *bio)
struct page *page;
int i;
- if (bio->bi_error) {
- sdp->sd_log_error = bio->bi_error;
- fs_err(sdp, "Error %d writing to log\n", bio->bi_error);
- }
+ if (bio->bi_status)
+ fs_err(sdp, "Error %d writing to log\n", bio->bi_status);
bio_for_each_segment_all(bvec, bio, i) {
page = bvec->bv_page;
if (page_has_buffers(page))
- gfs2_end_log_write_bh(sdp, bvec, bio->bi_error);
+ gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
else
mempool_free(page, gfs2_page_pool);
}
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 67d1fc4668f7..0a89e6f7a314 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -52,7 +52,6 @@ static void gfs2_init_glock_once(void *foo)
{
struct gfs2_glock *gl = foo;
- INIT_HLIST_BL_NODE(&gl->gl_list);
spin_lock_init(&gl->gl_lockref.lock);
INIT_LIST_HEAD(&gl->gl_holders);
INIT_LIST_HEAD(&gl->gl_lru);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 663ffc135ef3..fabe1614f879 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -201,7 +201,7 @@ static void gfs2_meta_read_endio(struct bio *bio)
do {
struct buffer_head *next = bh->b_this_page;
len -= bh->b_size;
- bh->b_end_io(bh, !bio->bi_error);
+ bh->b_end_io(bh, !bio->bi_status);
bh = next;
} while (bh && len);
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ed67548b286c..e76058d34b74 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -176,10 +176,10 @@ static void end_bio_io_page(struct bio *bio)
{
struct page *page = bio->bi_private;
- if (!bio->bi_error)
+ if (!bio->bi_status)
SetPageUptodate(page);
else
- pr_warn("error %d reading superblock\n", bio->bi_error);
+ pr_warn("error %d reading superblock\n", bio->bi_status);
unlock_page(page);
}
@@ -203,7 +203,7 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf)
memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
- memcpy(s->s_uuid, str->sb_uuid, 16);
+ memcpy(&s->s_uuid, str->sb_uuid, 16);
}
/**
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 83c9909ff14a..836e38ba5d0a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -705,9 +705,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
rb_erase(n, &sdp->sd_rindex_tree);
if (gl) {
- spin_lock(&gl->gl_lockref.lock);
- gl->gl_object = NULL;
- spin_unlock(&gl->gl_lockref.lock);
+ glock_set_object(gl, NULL);
gfs2_glock_add_to_lru(gl);
gfs2_glock_put(gl);
}
@@ -917,7 +915,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
error = rgd_insert(rgd);
spin_unlock(&sdp->sd_rindex_spin);
if (!error) {
- rgd->rd_gl->gl_object = rgd;
+ glock_set_object(rgd->rd_gl, rgd);
rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr +
rgd->rd_length) * bsize) - 1;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 29b0473f6e74..fdedec379b78 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1105,9 +1105,12 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host
gfs2_holder_uninit(gh);
error = err;
} else {
- if (!error)
- error = statfs_slow_fill(
- gh->gh_gl->gl_object, sc);
+ if (!error) {
+ struct gfs2_rgrpd *rgd =
+ gfs2_glock2rgrp(gh->gh_gl);
+
+ error = statfs_slow_fill(rgd, sc);
+ }
gfs2_glock_dq_uninit(gh);
}
}
@@ -1535,6 +1538,12 @@ static void gfs2_evict_inode(struct inode *inode)
if (inode->i_nlink || (sb->s_flags & MS_RDONLY))
goto out;
+ if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
+ BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));
+ gfs2_holder_mark_uninitialized(&gh);
+ goto alloc_failed;
+ }
+
/* Must not read inode block until block type has been verified */
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (unlikely(error)) {
@@ -1543,11 +1552,9 @@ static void gfs2_evict_inode(struct inode *inode)
goto out;
}
- if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
- error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
- if (error)
- goto out_truncate;
- }
+ error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
+ if (error)
+ goto out_truncate;
if (test_bit(GIF_INVALID, &ip->i_flags)) {
error = gfs2_inode_refresh(ip);
@@ -1555,6 +1562,7 @@ static void gfs2_evict_inode(struct inode *inode)
goto out_truncate;
}
+alloc_failed:
if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
@@ -1621,7 +1629,8 @@ out_unlock:
}
gfs2_holder_uninit(&ip->i_iopen_gh);
}
- gfs2_glock_dq_uninit(&gh);
+ if (gfs2_holder_initialized(&gh))
+ gfs2_glock_dq_uninit(&gh);
if (error && error != GLR_TRYFAILED && error != -EROFS)
fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
out:
@@ -1631,13 +1640,13 @@ out:
gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
- ip->i_gl->gl_object = NULL;
- flush_delayed_work(&ip->i_gl->gl_work);
+ glock_set_object(ip->i_gl, NULL);
+ wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
gfs2_glock_add_to_lru(ip->i_gl);
gfs2_glock_put(ip->i_gl);
ip->i_gl = NULL;
if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
- ip->i_iopen_gh.gh_gl->gl_object = NULL;
+ glock_set_object(ip->i_iopen_gh.gh_gl, NULL);
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
}
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7a515345610c..ca1f97ff898c 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -71,25 +71,14 @@ static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname);
}
-static int gfs2_uuid_valid(const u8 *uuid)
-{
- int i;
-
- for (i = 0; i < 16; i++) {
- if (uuid[i])
- return 1;
- }
- return 0;
-}
-
static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
{
struct super_block *s = sdp->sd_vfs;
- const u8 *uuid = s->s_uuid;
+
buf[0] = '\0';
- if (!gfs2_uuid_valid(uuid))
+ if (uuid_is_null(&s->s_uuid))
return 0;
- return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid);
+ return snprintf(buf, PAGE_SIZE, "%pUB\n", &s->s_uuid);
}
static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
@@ -637,12 +626,12 @@ static struct attribute *tune_attrs[] = {
NULL,
};
-static struct attribute_group tune_group = {
+static const struct attribute_group tune_group = {
.name = "tune",
.attrs = tune_attrs,
};
-static struct attribute_group lock_module_group = {
+static const struct attribute_group lock_module_group = {
.name = "lock_module",
.attrs = lock_module_attrs,
};
@@ -712,14 +701,13 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
{
struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
struct super_block *s = sdp->sd_vfs;
- const u8 *uuid = s->s_uuid;
add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
- if (gfs2_uuid_valid(uuid))
- add_uevent_var(env, "UUID=%pUB", uuid);
+ if (!uuid_is_null(&s->s_uuid))
+ add_uevent_var(env, "UUID=%pUB", &s->s_uuid);
return 0;
}
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index d87721aeb575..54179554c7d2 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1327,8 +1327,8 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
for (x = 0; x < rlist.rl_rgrps; x++) {
- struct gfs2_rgrpd *rgd;
- rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+ struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
+
rg_blocks += rgd->rd_length;
}
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index e33a0d36a93e..5d0182654580 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -485,8 +485,8 @@ void hfs_file_truncate(struct inode *inode)
/* XXX: Can use generic_cont_expand? */
size = inode->i_size - 1;
- res = pagecache_write_begin(NULL, mapping, size+1, 0,
- AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+ res = pagecache_write_begin(NULL, mapping, size+1, 0, 0,
+ &page, &fsdata);
if (!res) {
res = pagecache_write_end(NULL, mapping, size+1, 0, 0,
page, fsdata);
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index feca524ce2a5..a3eb640b4f8f 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -545,9 +545,8 @@ void hfsplus_file_truncate(struct inode *inode)
void *fsdata;
loff_t size = inode->i_size;
- res = pagecache_write_begin(NULL, mapping, size, 0,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
+ res = pagecache_write_begin(NULL, mapping, size, 0, 0,
+ &page, &fsdata);
if (res)
return;
res = pagecache_write_end(NULL, mapping, size,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dde861387a40..d44f5456eb9b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -200,7 +200,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/fs/inode.c b/fs/inode.c
index 131b2bcebc48..50370599e371 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -119,7 +119,7 @@ static int no_open(struct inode *inode, struct file *file)
}
/**
- * inode_init_always - perform inode structure intialisation
+ * inode_init_always - perform inode structure initialisation
* @sb: superblock inode belongs to
* @inode: inode to initialise
*
@@ -146,6 +146,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
i_gid_write(inode, 0);
atomic_set(&inode->i_writecount, 0);
inode->i_size = 0;
+ inode->i_write_hint = WRITE_LIFE_NOT_SET;
inode->i_blocks = 0;
inode->i_bytes = 0;
inode->i_generation = 0;
@@ -402,6 +403,8 @@ static void inode_lru_list_add(struct inode *inode)
{
if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
+ else
+ inode->i_state |= I_REFERENCED;
}
/*
@@ -1489,7 +1492,6 @@ static void iput_final(struct inode *inode)
drop = generic_drop_inode(inode);
if (!drop && (sb->s_flags & MS_ACTIVE)) {
- inode->i_state |= I_REFERENCED;
inode_add_lru(inode);
spin_unlock(&inode->i_lock);
return;
@@ -1890,11 +1892,11 @@ static void __wait_on_freeing_inode(struct inode *inode)
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
wq = bit_waitqueue(&inode->i_state, __I_NEW);
- prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_hash_lock);
schedule();
- finish_wait(wq, &wait.wait);
+ finish_wait(wq, &wait.wq_entry);
spin_lock(&inode_hash_lock);
}
@@ -1913,8 +1915,6 @@ __setup("ihash_entries=", set_ihash_entries);
*/
void __init inode_init_early(void)
{
- unsigned int loop;
-
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
*/
@@ -1926,20 +1926,15 @@ void __init inode_init_early(void)
sizeof(struct hlist_head),
ihash_entries,
14,
- HASH_EARLY,
+ HASH_EARLY | HASH_ZERO,
&i_hash_shift,
&i_hash_mask,
0,
0);
-
- for (loop = 0; loop < (1U << i_hash_shift); loop++)
- INIT_HLIST_HEAD(&inode_hashtable[loop]);
}
void __init inode_init(void)
{
- unsigned int loop;
-
/* inode slab cache */
inode_cachep = kmem_cache_create("inode_cache",
sizeof(struct inode),
@@ -1957,14 +1952,11 @@ void __init inode_init(void)
sizeof(struct hlist_head),
ihash_entries,
14,
- 0,
+ HASH_ZERO,
&i_hash_shift,
&i_hash_mask,
0,
0);
-
- for (loop = 0; loop < (1U << i_hash_shift); loop++)
- INIT_HLIST_HEAD(&inode_hashtable[loop]);
}
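
Both init paths can drop their INIT_HLIST_HEAD() loops because an hlist_head is a single pointer and a zeroed pointer already is the empty list, so asking alloc_large_system_hash() for pre-zeroed memory via HASH_ZERO is equivalent. A tiny standalone illustration (userspace C; the struct mirrors the kernel's definition):

    #include <assert.h>
    #include <string.h>

    struct hlist_node;
    struct hlist_head { struct hlist_node *first; };

    int main(void)
    {
            struct hlist_head table[16];

            memset(table, 0, sizeof(table));   /* what HASH_ZERO provides */
            assert(table[0].first == NULL);    /* same as INIT_HLIST_HEAD() */
            return 0;
    }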
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
@@ -2022,7 +2014,7 @@ bool inode_owner_or_capable(const struct inode *inode)
return true;
ns = current_user_ns();
- if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid))
+ if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
return true;
return false;
}
@@ -2037,11 +2029,11 @@ static void __inode_dio_wait(struct inode *inode)
DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
do {
- prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
if (atomic_read(&inode->i_dio_count))
schedule();
} while (atomic_read(&inode->i_dio_count));
- finish_wait(wq, &q.wait);
+ finish_wait(wq, &q.wq_entry);
}
/**
diff --git a/fs/internal.h b/fs/internal.h
index 076751d90ba2..9676fe11c093 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -126,8 +126,6 @@ static inline bool atime_needs_update_rcu(const struct path *path,
return __atime_needs_update(path, inode, true);
}
-extern bool atime_needs_update_rcu(const struct path *, struct inode *);
-
/*
* fs-writeback.c
*/
diff --git a/fs/iomap.c b/fs/iomap.c
index 1faabe09b8fd..173222863aca 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -158,12 +158,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
ssize_t written = 0;
unsigned int flags = AOP_FLAG_NOFS;
- /*
- * Copies from kernel address space cannot fail (NFSD is a big user).
- */
- if (!iter_is_iovec(i))
- flags |= AOP_FLAG_UNINTERRUPTIBLE;
-
do {
struct page *page;
unsigned long offset; /* Offset into pagecache page */
@@ -291,8 +285,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
return PTR_ERR(rpage);
status = iomap_write_begin(inode, pos, bytes,
- AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE,
- &page, iomap);
+ AOP_FLAG_NOFS, &page, iomap);
put_page(rpage);
if (unlikely(status))
return status;
@@ -343,8 +336,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
struct page *page;
int status;
- status = iomap_write_begin(inode, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
+ status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
+ iomap);
if (status)
return status;
@@ -591,6 +584,100 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
+static loff_t
+iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
+ void *data, struct iomap *iomap)
+{
+ switch (iomap->type) {
+ case IOMAP_UNWRITTEN:
+ offset = page_cache_seek_hole_data(inode, offset, length,
+ SEEK_HOLE);
+ if (offset < 0)
+ return length;
+ /* fall through */
+ case IOMAP_HOLE:
+ *(loff_t *)data = offset;
+ return 0;
+ default:
+ return length;
+ }
+}
+
+loff_t
+iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
+{
+ loff_t size = i_size_read(inode);
+ loff_t length = size - offset;
+ loff_t ret;
+
+ /* Nothing to be found beyond the end of the file. */
+ if (offset >= size)
+ return -ENXIO;
+
+ while (length > 0) {
+ ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
+ &offset, iomap_seek_hole_actor);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ break;
+
+ offset += ret;
+ length -= ret;
+ }
+
+ return offset;
+}
+EXPORT_SYMBOL_GPL(iomap_seek_hole);
+
+static loff_t
+iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
+ void *data, struct iomap *iomap)
+{
+ switch (iomap->type) {
+ case IOMAP_HOLE:
+ return length;
+ case IOMAP_UNWRITTEN:
+ offset = page_cache_seek_hole_data(inode, offset, length,
+ SEEK_DATA);
+ if (offset < 0)
+ return length;
+ /* fall through */
+ default:
+ *(loff_t *)data = offset;
+ return 0;
+ }
+}
+
+loff_t
+iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
+{
+ loff_t size = i_size_read(inode);
+ loff_t length = size - offset;
+ loff_t ret;
+
+ /* Nothing to be found beyond the end of the file. */
+ if (offset >= size)
+ return -ENXIO;
+
+ while (length > 0) {
+ ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
+ &offset, iomap_seek_data_actor);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ break;
+
+ offset += ret;
+ length -= ret;
+ }
+
+ if (length <= 0)
+ return -ENXIO;
+ return offset;
+}
+EXPORT_SYMBOL_GPL(iomap_seek_data);
+
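
Both helpers drive the same iomap_apply() loop: the actor returns 0 once it has stored a result in *(loff_t *)data, a positive length to keep scanning, or a negative error. A filesystem would typically call them from its ->llseek method, roughly like this (a sketch; the myfs_* names and myfs_iomap_ops are illustrative, the other calls are real):

    static loff_t myfs_llseek(struct file *file, loff_t offset, int whence)
    {
            struct inode *inode = file->f_mapping->host;

            switch (whence) {
            case SEEK_HOLE:
                    offset = iomap_seek_hole(inode, offset, &myfs_iomap_ops);
                    break;
            case SEEK_DATA:
                    offset = iomap_seek_data(inode, offset, &myfs_iomap_ops);
                    break;
            default:
                    return generic_file_llseek(file, offset, whence);
            }
            if (offset < 0)
                    return offset;
            return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
    }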
/*
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
@@ -679,8 +766,8 @@ static void iomap_dio_bio_end_io(struct bio *bio)
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
- if (bio->bi_error)
- iomap_dio_set_error(dio, bio->bi_error);
+ if (bio->bi_status)
+ iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
if (atomic_dec_and_test(&dio->ref)) {
if (is_sync_kiocb(dio->iocb)) {
@@ -800,6 +887,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
bio->bi_bdev = iomap->bdev;
bio->bi_iter.bi_sector =
iomap->blkno + ((pos - iomap->offset) >> 9);
+ bio->bi_write_hint = dio->iocb->ki_hint;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
@@ -888,6 +976,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
flags |= IOMAP_WRITE;
}
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_has_page(mapping, start, end)) {
+ ret = -EAGAIN;
+ goto out_free_dio;
+ }
+ flags |= IOMAP_NOWAIT;
+ }
+
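
IOCB_NOWAIT turns a direct I/O that would block, here because the range still has page-cache pages to flush, into an immediate -EAGAIN. From userspace the same behaviour is requested with RWF_NOWAIT on preadv2()/pwritev2(), on kernels and libcs that support it (a hedged sketch):

    #define _GNU_SOURCE
    #include <errno.h>
    #include <stdio.h>
    #include <sys/uio.h>

    static ssize_t read_nowait(int fd, void *buf, size_t len, off_t off)
    {
            struct iovec iov = { .iov_base = buf, .iov_len = len };
            ssize_t n = preadv2(fd, &iov, 1, off, RWF_NOWAIT);

            if (n < 0 && errno == EAGAIN)
                    fprintf(stderr, "would block; retry from a worker\n");
            return n;
    }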
ret = filemap_write_and_wait_range(mapping, start, end);
if (ret)
goto out_free_dio;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b6b194ec1b4f..3c1c31321d9b 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -263,18 +263,10 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
continue;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
- err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
- if (err) {
- /*
- * Because AS_EIO is cleared by
- * filemap_fdatawait_range(), set it again so
- * that user process can get -EIO from fsync().
- */
- mapping_set_error(jinode->i_vfs_inode->i_mapping, -EIO);
-
- if (!ret)
- ret = err;
- }
+ err = filemap_fdatawait_keep_errors(
+ jinode->i_vfs_inode->i_mapping);
+ if (!ret)
+ ret = err;
spin_lock(&journal->j_list_lock);
jinode->i_flags &= ~JI_COMMIT_RUNNING;
smp_mb();
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5a0245e36240..7d5ef3bf3f3e 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2363,7 +2363,7 @@ static int jbd2_journal_init_journal_head_cache(void)
jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
sizeof(struct journal_head),
0, /* offset */
- SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU,
+ SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
NULL); /* ctor */
retval = 0;
if (!jbd2_journal_head_cache) {
@@ -2579,10 +2579,10 @@ restart:
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
- prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&journal->j_list_lock);
schedule();
- finish_wait(wq, &wait.wait);
+ finish_wait(wq, &wait.wq_entry);
goto restart;
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 9ee4832b6f8b..8b08044b3120 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -409,25 +409,6 @@ static handle_t *new_handle(int nblocks)
return handle;
}
-/**
- * handle_t *jbd2_journal_start() - Obtain a new handle.
- * @journal: Journal to start transaction on.
- * @nblocks: number of block buffer we might modify
- *
- * We make sure that the transaction can guarantee at least nblocks of
- * modified buffers in the log. We block until the log can guarantee
- * that much space. Additionally, if rsv_blocks > 0, we also create another
- * handle with rsv_blocks reserved blocks in the journal. This handle is
- * is stored in h_rsv_handle. It is not attached to any particular transaction
- * and thus doesn't block transaction commit. If the caller uses this reserved
- * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
- * on the parent handle will dispose the reserved one. Reserved handle has to
- * be converted to a normal handle using jbd2_journal_start_reserved() before
- * it can be used.
- *
- * Return a pointer to a newly allocated handle, or an ERR_PTR() value
- * on failure.
- */
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
gfp_t gfp_mask, unsigned int type,
unsigned int line_no)
@@ -478,6 +459,25 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
EXPORT_SYMBOL(jbd2__journal_start);
+/**
+ * handle_t *jbd2_journal_start() - Obtain a new handle.
+ * @journal: Journal to start transaction on.
+ * @nblocks: number of block buffer we might modify
+ *
+ * We make sure that the transaction can guarantee at least nblocks of
+ * modified buffers in the log. We block until the log can guarantee
+ * that much space. Additionally, if rsv_blocks > 0, we also create another
+ * handle with rsv_blocks reserved blocks in the journal. This handle is
+ * stored in h_rsv_handle. It is not attached to any particular transaction
+ * and thus doesn't block transaction commit. If the caller uses this reserved
+ * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
+ * on the parent handle will dispose the reserved one. A reserved handle has to
+ * be converted to a normal handle using jbd2_journal_start_reserved() before
+ * it can be used.
+ *
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
+ */
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
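
Moving the kernel-doc onto jbd2_journal_start() matches how the API is used: callers open a handle with enough buffer credits, get write access to each buffer they modify, and close the handle. The canonical call sequence (a sketch using the real jbd2 API; error handling trimmed):

    handle_t *handle;
    int err;

    handle = jbd2_journal_start(journal, 1);   /* reserve one buffer credit */
    if (IS_ERR(handle))
            return PTR_ERR(handle);

    err = jbd2_journal_get_write_access(handle, bh);
    if (!err) {
            /* ... modify the buffer ... */
            err = jbd2_journal_dirty_metadata(handle, bh);
    }
    jbd2_journal_stop(handle);
    return err;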
@@ -680,6 +680,12 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
handle->h_buffer_credits = nblocks;
+ /*
+ * Restore the original nofs context because the journal restart
+ * is basically the same thing as journal stop and start.
+ * start_this_handle will start a new nofs context.
+ */
+ memalloc_nofs_restore(handle->saved_alloc_context);
ret = start_this_handle(journal, handle, gfp_mask);
return ret;
}
@@ -1066,10 +1072,10 @@ out:
* @handle: transaction to add buffer modifications to
* @bh: bh to be used for metadata writes
*
- * Returns an error code or 0 on success.
+ * Returns: error code or 0 on success.
*
* In full data journalling mode the buffer may be of type BJ_AsyncData,
- * because we're write()ing a buffer which is also part of a shared mapping.
+ * because we're ``write()ing`` a buffer which is also part of a shared mapping.
*/
int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 06a71dbd4833..389ea53ea487 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1366,7 +1366,7 @@ int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
jffs2_add_ino_cache(c, f->inocache);
}
if (!f->inocache) {
- JFFS2_ERROR("requestied to read an nonexistent ino %u\n", ino);
+ JFFS2_ERROR("requested to read a nonexistent ino %u\n", ino);
return -ENOENT;
}
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index bb1da1feafeb..a21f0e9eecd4 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2205,7 +2205,7 @@ static void lbmIODone(struct bio *bio)
bp->l_flag |= lbmDONE;
- if (bio->bi_error) {
+ if (bio->bi_status) {
bp->l_flag |= lbmERROR;
jfs_err("lbmIODone: I/O error in JFS log");
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 489aaa1403e5..65120a471729 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -280,7 +280,7 @@ static void metapage_read_end_io(struct bio *bio)
{
struct page *page = bio->bi_private;
- if (bio->bi_error) {
+ if (bio->bi_status) {
printk(KERN_ERR "metapage_read_end_io: I/O error\n");
SetPageError(page);
}
@@ -337,7 +337,7 @@ static void metapage_write_end_io(struct bio *bio)
BUG_ON(!PagePrivate(page));
- if (bio->bi_error) {
+ if (bio->bi_status) {
printk(KERN_ERR "metapage_write_end_io: I/O error\n");
SetPageError(page);
}
@@ -664,6 +664,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
INCREMENT(mpStat.pagealloc);
mp = alloc_metapage(GFP_NOFS);
mp->page = page;
+ mp->sb = inode->i_sb;
mp->flag = 0;
mp->xflag = COMMIT_PAGE;
mp->count = 1;
@@ -711,7 +712,8 @@ void force_metapage(struct metapage *mp)
get_page(page);
lock_page(page);
set_page_dirty(page);
- write_one_page(page, 1);
+ if (write_one_page(page))
+ jfs_error(mp->sb, "write_one_page() failed\n");
clear_bit(META_forcewrite, &mp->flag);
put_page(page);
}
@@ -756,7 +758,8 @@ void release_metapage(struct metapage * mp)
set_page_dirty(page);
if (test_bit(META_sync, &mp->flag)) {
clear_bit(META_sync, &mp->flag);
- write_one_page(page, 1);
+ if (write_one_page(page))
+ jfs_error(mp->sb, "write_one_page() failed\n");
lock_page(page); /* write_one_page unlocks the page */
}
} else if (mp->lsn) /* discard_metapage doesn't remove it */
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index a869fb4a20d6..8b0ee514eb84 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -38,6 +38,7 @@ struct metapage {
/* implementation */
struct page *page;
+ struct super_block *sb;
unsigned int logical_size;
/* Journal management */
diff --git a/fs/libfs.c b/fs/libfs.c
index a8b62e5d43a9..3aabe553fc45 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -507,7 +507,7 @@ EXPORT_SYMBOL(simple_write_end);
* to pass it an appropriate max_reserved value to avoid collisions.
*/
int simple_fill_super(struct super_block *s, unsigned long magic,
- struct tree_descr *files)
+ const struct tree_descr *files)
{
struct inode *inode;
struct dentry *root;
@@ -974,7 +974,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
int err;
int ret;
- err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ err = file_write_and_wait_range(file, start, end);
if (err)
return err;
@@ -991,6 +991,10 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
out:
inode_unlock(inode);
+ /* check and advance again to catch errors after syncing out buffers */
+ err = file_check_and_advance_wb_err(file);
+ if (ret == 0)
+ ret = err;
return ret;
}
EXPORT_SYMBOL(__generic_file_fsync);
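
The switch to file_write_and_wait_range() plus the extra file_check_and_advance_wb_err() hooks __generic_file_fsync() into errseq_t error tracking, where each struct file keeps its own cursor into the mapping's error stream and therefore sees a writeback error exactly once. A sketch of the resulting fsync shape for a hypothetical filesystem (myfs_fsync is illustrative; both file_* helpers are the real API):

    static int myfs_fsync(struct file *file, loff_t start, loff_t end,
                          int datasync)
    {
            int err;

            /* write back dirty pages; errors land in file->f_wb_err */
            err = file_write_and_wait_range(file, start, end);
            if (err)
                    return err;

            /* ... flush metadata / commit the journal here ... */

            /* report errors raised since this file last checked */
            return file_check_and_advance_wb_err(file);
    }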
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 41e491b8e5d7..27d577dbe51a 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -69,6 +69,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL)
goto out_nobind;
+ host->h_nlmclnt_ops = nlm_init->nlmclnt_ops;
return host;
out_nobind:
nlmclnt_release_host(host);
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 112952037933..066ac313ae5c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -150,17 +150,22 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req)
* @host: address of a valid nlm_host context representing the NLM server
* @cmd: fcntl-style file lock operation to perform
* @fl: address of arguments for the lock operation
+ * @data: address of data to be sent to callback operations
*
*/
-int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
+int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data)
{
struct nlm_rqst *call;
int status;
+ const struct nlmclnt_operations *nlmclnt_ops = host->h_nlmclnt_ops;
call = nlm_alloc_call(host);
if (call == NULL)
return -ENOMEM;
+ if (nlmclnt_ops && nlmclnt_ops->nlmclnt_alloc_call)
+ nlmclnt_ops->nlmclnt_alloc_call(data);
+
nlmclnt_locks_init_private(fl, host);
if (!fl->fl_u.nfs_fl.owner) {
/* lockowner allocation has failed */
@@ -169,6 +174,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
}
/* Set up the argument struct */
nlmclnt_setlockargs(call, fl);
+ call->a_callback_data = data;
if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
if (fl->fl_type != F_UNLCK) {
@@ -214,8 +220,12 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
void nlmclnt_release_call(struct nlm_rqst *call)
{
+ const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops;
+
if (!atomic_dec_and_test(&call->a_count))
return;
+ if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call)
+ nlmclnt_ops->nlmclnt_release_call(call->a_callback_data);
nlmclnt_release_host(call->a_host);
nlmclnt_release_lockargs(call);
kfree(call);
@@ -687,6 +697,19 @@ out:
return status;
}
+static void nlmclnt_unlock_prepare(struct rpc_task *task, void *data)
+{
+ struct nlm_rqst *req = data;
+ const struct nlmclnt_operations *nlmclnt_ops = req->a_host->h_nlmclnt_ops;
+ bool defer_call = false;
+
+ if (nlmclnt_ops && nlmclnt_ops->nlmclnt_unlock_prepare)
+ defer_call = nlmclnt_ops->nlmclnt_unlock_prepare(task, req->a_callback_data);
+
+ if (!defer_call)
+ rpc_call_start(task);
+}
+
static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
{
struct nlm_rqst *req = data;
@@ -720,6 +743,7 @@ die:
}
static const struct rpc_call_ops nlmclnt_unlock_ops = {
+ .rpc_call_prepare = nlmclnt_unlock_prepare,
.rpc_call_done = nlmclnt_unlock_callback,
.rpc_release = nlmclnt_rpc_release,
};
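
The new nlmclnt_operations let the NLM client's user (NFS in-tree) keep a private cookie alive across the async unlock RPC: the cookie passed to nlmclnt_proc() is pinned in nlmclnt_alloc_call(), may defer rpc_call_start() from nlmclnt_unlock_prepare(), and is released in nlmclnt_release_call(). A sketch of a caller wiring this up (the my_* names are hypothetical; the struct and field names are as used in the hunks above):

    static const struct nlmclnt_operations my_nlmclnt_ops = {
            /* take a reference on the cookie when a call is allocated */
            .nlmclnt_alloc_call     = my_data_get,
            /* return true to defer rpc_call_start() for an unlock */
            .nlmclnt_unlock_prepare = my_unlock_prepare,
            /* drop the cookie reference when the call is released */
            .nlmclnt_release_call   = my_data_put,
    };

    /* advertised to lockd at init time: */
    /*      nlm_init.nlmclnt_ops = &my_nlmclnt_ops;  (see nlmclnt_init above) */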
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index e7c8b9c76e48..5d481e8a1b5d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -132,6 +132,8 @@ lockd(void *vrqstp)
{
int err = 0;
struct svc_rqst *rqstp = vrqstp;
+ struct net *net = &init_net;
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
/* try_to_freeze() is called from svc_recv() */
set_freezable();
@@ -176,6 +178,8 @@ lockd(void *vrqstp)
if (nlmsvc_ops)
nlmsvc_invalidate_all();
nlm_shutdown_hosts();
+ cancel_delayed_work_sync(&ln->grace_period_end);
+ locks_end_grace(&ln->lockd_manager);
return 0;
}
@@ -270,8 +274,6 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
if (ln->nlmsvc_users) {
if (--ln->nlmsvc_users == 0) {
nlm_shutdown_hosts_net(net);
- cancel_delayed_work_sync(&ln->grace_period_end);
- locks_end_grace(&ln->lockd_manager);
svc_shutdown_net(serv, net);
dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
}
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 5581e020644b..3507c80d1d4b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -870,15 +870,15 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
if (!(block = nlmsvc_find_block(cookie)))
return;
- if (block) {
- if (status == nlm_lck_denied_grace_period) {
- /* Try again in a couple of seconds */
- nlmsvc_insert_block(block, 10 * HZ);
- } else {
- /* Lock is now held by client, or has been rejected.
- * In both cases, the block should be removed. */
- nlmsvc_unlink_block(block);
- }
+ if (status == nlm_lck_denied_grace_period) {
+ /* Try again in a couple of seconds */
+ nlmsvc_insert_block(block, 10 * HZ);
+ } else {
+ /*
+ * Lock is now held by client, or has been rejected.
+ * In both cases, the block should be removed.
+ */
+ nlmsvc_unlink_block(block);
}
nlmsvc_release_block(block);
}
diff --git a/fs/locks.c b/fs/locks.c
index 26811321d39b..afefeb4ad6de 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1858,8 +1858,8 @@ EXPORT_SYMBOL(generic_setlease);
*
* Call this to establish a lease on the file. The "lease" argument is not
* used for F_UNLCK requests and may be NULL. For commands that set or alter
- * an existing lease, the (*lease)->fl_lmops->lm_break operation must be set;
- * if not, this function will return -ENOLCK (and generate a scary-looking
+ * an existing lease, the ``(*lease)->fl_lmops->lm_break`` operation must be
+ * set; if not, this function will return -ENOLCK (and generate a scary-looking
* stack trace).
*
* The "priv" pointer is passed directly to the lm_setup function as-is. It
@@ -1972,15 +1972,13 @@ EXPORT_SYMBOL(locks_lock_inode_wait);
* @cmd: the type of lock to apply.
*
* Apply a %FL_FLOCK style lock to an open file descriptor.
- * The @cmd can be one of
+ * The @cmd can be one of:
*
- * %LOCK_SH -- a shared lock.
- *
- * %LOCK_EX -- an exclusive lock.
- *
- * %LOCK_UN -- remove an existing lock.
- *
- * %LOCK_MAND -- a `mandatory' flock. This exists to emulate Windows Share Modes.
+ * - %LOCK_SH -- a shared lock.
+ * - %LOCK_EX -- an exclusive lock.
+ * - %LOCK_UN -- remove an existing lock.
+ * - %LOCK_MAND -- a 'mandatory' flock.
+ * This exists to emulate Windows Share Modes.
*
* %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
* processes read and write access respectively.
@@ -2086,26 +2084,22 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
/* Report the first existing lock that would conflict with l.
* This implements the F_GETLK command of fcntl().
*/
-int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
+int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
{
struct file_lock file_lock;
- struct flock flock;
int error;
- error = -EFAULT;
- if (copy_from_user(&flock, l, sizeof(flock)))
- goto out;
error = -EINVAL;
- if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK))
+ if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
goto out;
- error = flock_to_posix_lock(filp, &file_lock, &flock);
+ error = flock_to_posix_lock(filp, &file_lock, flock);
if (error)
goto out;
if (cmd == F_OFD_GETLK) {
error = -EINVAL;
- if (flock.l_pid != 0)
+ if (flock->l_pid != 0)
goto out;
cmd = F_GETLK;
@@ -2117,15 +2111,12 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
if (error)
goto out;
- flock.l_type = file_lock.fl_type;
+ flock->l_type = file_lock.fl_type;
if (file_lock.fl_type != F_UNLCK) {
- error = posix_lock_to_flock(&flock, &file_lock);
+ error = posix_lock_to_flock(flock, &file_lock);
if (error)
goto rel_priv;
}
- error = -EFAULT;
- if (!copy_to_user(l, &flock, sizeof(flock)))
- error = 0;
rel_priv:
locks_release_private(&file_lock);
out:
@@ -2218,26 +2209,16 @@ check_fmode_for_setlk(struct file_lock *fl)
* This implements both the F_SETLK and F_SETLKW commands of fcntl().
*/
int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
- struct flock __user *l)
+ struct flock *flock)
{
struct file_lock *file_lock = locks_alloc_lock();
- struct flock flock;
- struct inode *inode;
+ struct inode *inode = locks_inode(filp);
struct file *f;
int error;
if (file_lock == NULL)
return -ENOLCK;
- inode = locks_inode(filp);
-
- /*
- * This might block, so we do it before checking the inode.
- */
- error = -EFAULT;
- if (copy_from_user(&flock, l, sizeof(flock)))
- goto out;
-
/* Don't allow mandatory locks on files that may be memory mapped
* and shared.
*/
@@ -2246,7 +2227,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
}
- error = flock_to_posix_lock(filp, file_lock, &flock);
+ error = flock_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
@@ -2261,7 +2242,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
switch (cmd) {
case F_OFD_SETLK:
error = -EINVAL;
- if (flock.l_pid != 0)
+ if (flock->l_pid != 0)
goto out;
cmd = F_SETLK;
@@ -2270,7 +2251,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
break;
case F_OFD_SETLKW:
error = -EINVAL;
- if (flock.l_pid != 0)
+ if (flock->l_pid != 0)
goto out;
cmd = F_SETLKW;
@@ -2315,26 +2296,22 @@ out:
/* Report the first existing lock that would conflict with l.
* This implements the F_GETLK command of fcntl().
*/
-int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
+int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
{
struct file_lock file_lock;
- struct flock64 flock;
int error;
- error = -EFAULT;
- if (copy_from_user(&flock, l, sizeof(flock)))
- goto out;
error = -EINVAL;
- if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK))
+ if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
goto out;
- error = flock64_to_posix_lock(filp, &file_lock, &flock);
+ error = flock64_to_posix_lock(filp, &file_lock, flock);
if (error)
goto out;
if (cmd == F_OFD_GETLK) {
error = -EINVAL;
- if (flock.l_pid != 0)
+ if (flock->l_pid != 0)
goto out;
cmd = F_GETLK64;
@@ -2346,13 +2323,9 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
if (error)
goto out;
- flock.l_type = file_lock.fl_type;
+ flock->l_type = file_lock.fl_type;
if (file_lock.fl_type != F_UNLCK)
- posix_lock_to_flock64(&flock, &file_lock);
-
- error = -EFAULT;
- if (!copy_to_user(l, &flock, sizeof(flock)))
- error = 0;
+ posix_lock_to_flock64(flock, &file_lock);
locks_release_private(&file_lock);
out:
@@ -2363,26 +2336,16 @@ out:
* This implements both the F_SETLK and F_SETLKW commands of fcntl().
*/
int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
- struct flock64 __user *l)
+ struct flock64 *flock)
{
struct file_lock *file_lock = locks_alloc_lock();
- struct flock64 flock;
- struct inode *inode;
+ struct inode *inode = locks_inode(filp);
struct file *f;
int error;
if (file_lock == NULL)
return -ENOLCK;
- /*
- * This might block, so we do it before checking the inode.
- */
- error = -EFAULT;
- if (copy_from_user(&flock, l, sizeof(flock)))
- goto out;
-
- inode = locks_inode(filp);
-
/* Don't allow mandatory locks on files that may be memory mapped
* and shared.
*/
@@ -2391,7 +2354,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
}
- error = flock64_to_posix_lock(filp, file_lock, &flock);
+ error = flock64_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
@@ -2406,7 +2369,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
switch (cmd) {
case F_OFD_SETLK:
error = -EINVAL;
- if (flock.l_pid != 0)
+ if (flock->l_pid != 0)
goto out;
cmd = F_SETLK64;
@@ -2415,7 +2378,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
break;
case F_OFD_SETLKW:
error = -EINVAL;
- if (flock.l_pid != 0)
+ if (flock->l_pid != 0)
goto out;
cmd = F_SETLKW64;
@@ -2504,7 +2467,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
.fl_owner = filp,
.fl_pid = current->tgid,
.fl_file = filp,
- .fl_flags = FL_FLOCK,
+ .fl_flags = FL_FLOCK | FL_CLOSE,
.fl_type = F_UNLCK,
.fl_end = OFFSET_MAX,
};
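
With fcntl_getlk()/fcntl_setlk() and their 64-bit variants now taking a kernel-space struct flock, the user copy moves out to the fcntl(2) entry points, which lets a compat syscall path convert its 32-bit layout and share the same helpers. The caller side becomes, roughly (a sketch, not the verbatim fs/fcntl.c code):

    struct flock flock;
    int err;

    if (copy_from_user(&flock, (struct flock __user *)arg, sizeof(flock)))
            return -EFAULT;
    err = fcntl_getlk(filp, cmd, &flock);
    if (!err && copy_to_user((struct flock __user *)arg, &flock,
                             sizeof(flock)))
            err = -EFAULT;
    return err;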
diff --git a/fs/mbcache.c b/fs/mbcache.c
index b19be429d655..d818fd236787 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -10,13 +10,14 @@
/*
* Mbcache is a simple key-value store. Keys need not be unique; however,
* key-value pairs are expected to be unique (we use this fact in
- * mb_cache_entry_delete_block()).
+ * mb_cache_entry_delete()).
*
* Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
- * They use hash of a block contents as a key and block number as a value.
- * That's why keys need not be unique (different xattr blocks may end up having
- * the same hash). However block number always uniquely identifies a cache
- * entry.
+ * Ext4 also uses it for deduplication of xattr values stored in inodes.
+ * They use a hash of the data as the key and provide a value that may
+ * represent a block or inode number. That's why keys need not be unique
+ * (hashes of different data may be the same). However, the user-provided
+ * value always uniquely identifies a cache entry.
*
* We provide functions for creation and removal of entries, search by key,
* and a special "delete entry with given key-value pair" operation. Fixed
@@ -62,15 +63,15 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
* @cache - cache where the entry should be created
* @mask - gfp mask with which the entry should be allocated
* @key - key of the entry
- * @block - block that contains data
- * @reusable - is the block reusable by other inodes?
+ * @value - value of the entry
+ * @reusable - is the entry reusable by others?
*
- * Creates entry in @cache with key @key and records that data is stored in
- * block @block. The function returns -EBUSY if entry with the same key
- * and for the same block already exists in cache. Otherwise 0 is returned.
+ * Creates entry in @cache with key @key and value @value. The function returns
+ * -EBUSY if entry with the same key and value already exists in cache.
+ * Otherwise 0 is returned.
*/
int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
- sector_t block, bool reusable)
+ u64 value, bool reusable)
{
struct mb_cache_entry *entry, *dup;
struct hlist_bl_node *dup_node;
@@ -91,12 +92,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
/* One ref for hash, one ref returned */
atomic_set(&entry->e_refcnt, 1);
entry->e_key = key;
- entry->e_block = block;
+ entry->e_value = value;
entry->e_reusable = reusable;
head = mb_cache_entry_head(cache, key);
hlist_bl_lock(head);
hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
- if (dup->e_key == key && dup->e_block == block) {
+ if (dup->e_key == key && dup->e_value == value) {
hlist_bl_unlock(head);
kmem_cache_free(mb_entry_cache, entry);
return -EBUSY;
@@ -187,13 +188,13 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
EXPORT_SYMBOL(mb_cache_entry_find_next);
/*
- * mb_cache_entry_get - get a cache entry by block number (and key)
+ * mb_cache_entry_get - get a cache entry by value (and key)
* @cache - cache we work with
- * @key - key of block number @block
- * @block - block number
+ * @key - key
+ * @value - value
*/
struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
- sector_t block)
+ u64 value)
{
struct hlist_bl_node *node;
struct hlist_bl_head *head;
@@ -202,7 +203,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
head = mb_cache_entry_head(cache, key);
hlist_bl_lock(head);
hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
- if (entry->e_key == key && entry->e_block == block) {
+ if (entry->e_key == key && entry->e_value == value) {
atomic_inc(&entry->e_refcnt);
goto out;
}
@@ -214,15 +215,14 @@ out:
}
EXPORT_SYMBOL(mb_cache_entry_get);
-/* mb_cache_entry_delete_block - remove information about block from cache
+/* mb_cache_entry_delete - remove a cache entry
* @cache - cache we work with
- * @key - key of block @block
- * @block - block number
+ * @key - key
+ * @value - value
*
- * Remove entry from cache @cache with key @key with data stored in @block.
+ * Remove entry from cache @cache with key @key and value @value.
*/
-void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
- sector_t block)
+void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
{
struct hlist_bl_node *node;
struct hlist_bl_head *head;
@@ -231,7 +231,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
head = mb_cache_entry_head(cache, key);
hlist_bl_lock(head);
hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
- if (entry->e_key == key && entry->e_block == block) {
+ if (entry->e_key == key && entry->e_value == value) {
/* We keep hash list reference to keep entry alive */
hlist_bl_del_init(&entry->e_hash_list);
hlist_bl_unlock(head);
@@ -248,7 +248,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
}
hlist_bl_unlock(head);
}
-EXPORT_SYMBOL(mb_cache_entry_delete_block);
+EXPORT_SYMBOL(mb_cache_entry_delete);
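
After the rename the cache is keyed exactly as before, but the value is an opaque u64 instead of a sector_t block number. A usage sketch built only from the functions shown above plus mb_cache_entry_put() (err/entry declarations omitted):

    /* 'hash' is the caller's content hash; 'value' may be a block or
     * inode number. -EBUSY means the same key/value pair already exists. */
    err = mb_cache_entry_create(cache, GFP_NOFS, hash, value, true);
    if (err && err != -EBUSY)
            return err;

    entry = mb_cache_entry_get(cache, hash, value);
    if (entry) {
            /* ... use the entry ... */
            mb_cache_entry_put(cache, entry);
    }

    mb_cache_entry_delete(cache, hash, value); /* was ..._delete_block() */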
/* mb_cache_entry_touch - cache entry got used
* @cache - cache the entry belongs to
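
The mbcache conversion above widens the entry payload from a sector_t block number to an opaque u64 value, so callers are no longer tied to disk locations. A minimal sketch of a caller under the new signature (function and variable names here are hypothetical):

static int example_cache_entry(struct mb_cache *cache, u32 hash, u64 value)
{
        int err;

        /* old callers passed a sector_t block; any 64-bit value works now */
        err = mb_cache_entry_create(cache, GFP_NOFS, hash, value, true);
        if (err == -EBUSY)
                err = 0;        /* an identical (key, value) entry already exists */
        return err;
}
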
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 7edc9b395700..baa9721f1299 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -57,7 +57,7 @@ static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
mark_inode_dirty(dir);
}
if (IS_DIRSYNC(dir))
- err = write_one_page(page, 1);
+ err = write_one_page(page);
else
unlock_page(page);
return err;
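
write_one_page() loses its wait argument in the hunk above; the sketch below assumes, per that conversion, that the function now always writes synchronously and expects a locked page on entry. Names are illustrative:

static int example_flush_page(struct page *page)
{
        lock_page(page);
        set_page_dirty(page);
        /* writes the page and waits; the page is unlocked on return */
        return write_one_page(page);
}
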
diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index 4c57c9af6946..2d1ca08870f7 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -142,7 +142,7 @@ changed:
return -EAGAIN;
}
-static inline int get_block(struct inode * inode, sector_t block,
+static int get_block(struct inode * inode, sector_t block,
struct buffer_head *bh, int create)
{
int err = -EIO;
diff --git a/fs/mount.h b/fs/mount.h
index bf1fda6eed8f..de45d9e76748 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,6 +58,7 @@ struct mount {
struct mnt_namespace *mnt_ns; /* containing namespace */
struct mountpoint *mnt_mp; /* where is it mounted */
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
+ struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
diff --git a/fs/mpage.c b/fs/mpage.c
index baff8f820c29..2e4c41ccb5c9 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -50,7 +50,8 @@ static void mpage_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- page_endio(page, op_is_write(bio_op(bio)), bio->bi_error);
+ page_endio(page, op_is_write(bio_op(bio)),
+ blk_status_to_errno(bio->bi_status));
}
bio_put(bio);
@@ -344,6 +345,7 @@ confused:
*
* So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
* submitted in the following order:
+ *
* 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
*
* because the indirect block has to be read to get the mappings of blocks
@@ -614,6 +616,7 @@ alloc_new:
goto confused;
wbc_init_bio(wbc, bio);
+ bio->bi_write_hint = inode->i_write_hint;
}
/*
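
The mpage change above reflects the block layer's switch from a raw errno in bi_error to a blk_status_t in bi_status; a sketch of a completion handler under the new convention (example_end_io is illustrative):

static void example_end_io(struct bio *bio)
{
        /* bi_status is a blk_status_t; translate for errno-based callers */
        int err = blk_status_to_errno(bio->bi_status);

        if (err)
                pr_debug("bio failed: %d\n", err);
        bio_put(bio);
}
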
diff --git a/fs/namei.c b/fs/namei.c
index 9a7f8bd748d8..e0b46eb0e212 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1008,7 +1008,7 @@ static int may_linkat(struct path *link)
/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
* otherwise, it must be a safe source.
*/
- if (inode_owner_or_capable(inode) || safe_hardlink_source(inode))
+ if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
return 0;
audit_log_link_denied("linkat", link);
@@ -2142,7 +2142,6 @@ OK:
static const char *path_init(struct nameidata *nd, unsigned flags)
{
- int retval = 0;
const char *s = nd->name->name;
if (!*s)
@@ -2154,13 +2153,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
- if (*s) {
- if (!d_can_lookup(root))
- return ERR_PTR(-ENOTDIR);
- retval = inode_permission(inode, MAY_EXEC);
- if (retval)
- return ERR_PTR(retval);
- }
+ if (*s && unlikely(!d_can_lookup(root)))
+ return ERR_PTR(-ENOTDIR);
nd->path = nd->root;
nd->inode = inode;
if (flags & LOOKUP_RCU) {
@@ -2258,6 +2252,35 @@ static inline int lookup_last(struct nameidata *nd)
return walk_component(nd, 0);
}
+static int handle_lookup_down(struct nameidata *nd)
+{
+ struct path path = nd->path;
+ struct inode *inode = nd->inode;
+ unsigned seq = nd->seq;
+ int err;
+
+ if (nd->flags & LOOKUP_RCU) {
+ /*
+ * don't bother with unlazy_walk on failure - we are
+ * at the very beginning of walk, so we lose nothing
+ * if we simply redo everything in non-RCU mode
+ */
+ if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq)))
+ return -ECHILD;
+ } else {
+ dget(path.dentry);
+ err = follow_managed(&path, nd);
+ if (unlikely(err < 0))
+ return err;
+ inode = d_backing_inode(path.dentry);
+ seq = 0;
+ }
+ path_to_nameidata(&path, nd);
+ nd->inode = inode;
+ nd->seq = seq;
+ return 0;
+}
+
/* Returns 0 and nd will be valid on success; returns an error otherwise. */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
@@ -2266,6 +2289,15 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
if (IS_ERR(s))
return PTR_ERR(s);
+
+ if (unlikely(flags & LOOKUP_DOWN)) {
+ err = handle_lookup_down(nd);
+ if (unlikely(err < 0)) {
+ terminate_walk(nd);
+ return err;
+ }
+ }
+
while (!(err = link_path_walk(s, nd))
&& ((err = lookup_last(nd)) > 0)) {
s = trailing_symlink(nd);
@@ -4300,6 +4332,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* The worst of all namespace operations - renaming directory. "Perverted"
* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
* Problems:
+ *
* a) we can get into loop creation.
* b) race potential - two innocent renames can create a loop together.
* That's where 4.4 screws up. Current fix: serialization on
@@ -4330,11 +4363,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
{
int error;
bool is_dir = d_is_dir(old_dentry);
- const unsigned char *old_name;
struct inode *source = old_dentry->d_inode;
struct inode *target = new_dentry->d_inode;
bool new_is_dir = false;
unsigned max_links = new_dir->i_sb->s_max_links;
+ struct name_snapshot old_name;
if (source == target)
return 0;
@@ -4381,7 +4414,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (error)
return error;
- old_name = fsnotify_oldname_init(old_dentry->d_name.name);
+ take_dentry_name_snapshot(&old_name, old_dentry);
dget(new_dentry);
if (!is_dir || (flags & RENAME_EXCHANGE))
lock_two_nondirectories(source, target);
@@ -4436,14 +4469,14 @@ out:
inode_unlock(target);
dput(new_dentry);
if (!error) {
- fsnotify_move(old_dir, new_dir, old_name, is_dir,
+ fsnotify_move(old_dir, new_dir, old_name.name, is_dir,
!(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
if (flags & RENAME_EXCHANGE) {
fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
new_is_dir, NULL, new_dentry);
}
}
- fsnotify_oldname_free(old_name);
+ release_dentry_name_snapshot(&old_name);
return error;
}
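
take_dentry_name_snapshot() replaces the old fsnotify_oldname_init()/free() pair and stabilizes the dentry name across a concurrent rename. An illustrative use, assuming only the API shown above:

static void example_report(struct dentry *dentry)
{
        struct name_snapshot name;

        take_dentry_name_snapshot(&name, dentry);
        /* name.name stays valid even if the dentry is renamed meanwhile */
        pr_debug("working on %s\n", name.name);
        release_dentry_name_snapshot(&name);
}
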
@@ -4766,7 +4799,7 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
struct page *page;
void *fsdata;
int err;
- unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
+ unsigned int flags = 0;
if (nofs)
flags |= AOP_FLAG_NOFS;
diff --git a/fs/namespace.c b/fs/namespace.c
index b3b115bd4e1e..81f934b5d571 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -236,6 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
INIT_HLIST_NODE(&mnt->mnt_mp_list);
+ INIT_LIST_HEAD(&mnt->mnt_umounting);
init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
}
return mnt;
@@ -3238,7 +3239,6 @@ static void __init init_mount_tree(void)
void __init mnt_init(void)
{
- unsigned u;
int err;
mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
@@ -3247,22 +3247,17 @@ void __init mnt_init(void)
mount_hashtable = alloc_large_system_hash("Mount-cache",
sizeof(struct hlist_head),
mhash_entries, 19,
- 0,
+ HASH_ZERO,
&m_hash_shift, &m_hash_mask, 0, 0);
mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
sizeof(struct hlist_head),
mphash_entries, 19,
- 0,
+ HASH_ZERO,
&mp_hash_shift, &mp_hash_mask, 0, 0);
if (!mount_hashtable || !mountpoint_hashtable)
panic("Failed to allocate mount hash table\n");
- for (u = 0; u <= m_hash_mask; u++)
- INIT_HLIST_HEAD(&mount_hashtable[u]);
- for (u = 0; u <= mp_hash_mask; u++)
- INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
-
kernfs_init();
err = sysfs_init();
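
HASH_ZERO asks alloc_large_system_hash() to return zeroed memory, which is why the explicit INIT_HLIST_HEAD() loops above can go: an all-zero hlist_head is already the empty list. A condensed sketch (table name and variables are illustrative):

static struct hlist_head *example_hashtable;
static unsigned int example_shift, example_mask;

static void __init example_hash_init(void)
{
        example_hashtable = alloc_large_system_hash("Example-cache",
                                sizeof(struct hlist_head),
                                0, 19,          /* auto-size, scale */
                                HASH_ZERO,      /* pre-zeroed buckets */
                                &example_shift, &example_mask, 0, 0);
        if (!example_hashtable)
                panic("Failed to allocate example hash table\n");
}
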
@@ -3462,8 +3457,9 @@ static void mntns_put(struct ns_common *ns)
static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
struct fs_struct *fs = current->fs;
- struct mnt_namespace *mnt_ns = to_mnt_ns(ns);
+ struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
struct path root;
+ int err;
if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
@@ -3474,15 +3470,20 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return -EINVAL;
get_mnt_ns(mnt_ns);
- put_mnt_ns(nsproxy->mnt_ns);
+ old_mnt_ns = nsproxy->mnt_ns;
nsproxy->mnt_ns = mnt_ns;
/* Find the root */
- root.mnt = &mnt_ns->root->mnt;
- root.dentry = mnt_ns->root->mnt.mnt_root;
- path_get(&root);
- while(d_mountpoint(root.dentry) && follow_down_one(&root))
- ;
+ err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
+ "/", LOOKUP_DOWN, &root);
+ if (err) {
+ /* revert to old namespace */
+ nsproxy->mnt_ns = old_mnt_ns;
+ put_mnt_ns(mnt_ns);
+ return err;
+ }
+
+ put_mnt_ns(old_mnt_ns);
/* Update the pwd and root */
set_fs_pwd(fs, &root);
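
LOOKUP_DOWN (handled by handle_lookup_down() in the namei.c hunk above) makes the lookup descend through any mounts stacked on the starting point before walking the path; that is what lets setns() find the effective root here without the old follow_down_one() loop. A sketch of the call:

/* Illustrative helper: resolve the visible root of a mount namespace,
 * following anything mounted on top of it. */
static int example_ns_root(struct mnt_namespace *ns, struct path *root)
{
        return vfs_path_lookup(ns->root->mnt.mnt_root, &ns->root->mnt,
                               "/", LOOKUP_DOWN, root);
}
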
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 0c3905e0542e..6719c0be674d 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -89,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_fault *vmf)
* -- nyc
*/
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
+ count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
return VM_FAULT_MAJOR;
}
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index f31fd0dd92c6..69d02cf8cf37 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -123,11 +123,6 @@ config PNFS_BLOCK
depends on NFS_V4_1 && BLK_DEV_DM
default NFS_V4
-config PNFS_OBJLAYOUT
- tristate
- depends on NFS_V4_1 && SCSI_OSD_ULD
- default NFS_V4
-
config PNFS_FLEXFILE_LAYOUT
tristate
depends on NFS_V4_1 && NFS_V3
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 6abdda209642..98f4e5728a67 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -31,6 +31,5 @@ nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o
nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
-obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 0ca370d23ddb..d8863a804b15 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -188,7 +188,7 @@ static void bl_end_io_read(struct bio *bio)
{
struct parallel_io *par = bio->bi_private;
- if (bio->bi_error) {
+ if (bio->bi_status) {
struct nfs_pgio_header *header = par->data;
if (!header->pnfs_error)
@@ -319,7 +319,7 @@ static void bl_end_io_write(struct bio *bio)
struct parallel_io *par = bio->bi_private;
struct nfs_pgio_header *header = par->data;
- if (bio->bi_error) {
+ if (bio->bi_status) {
if (!header->pnfs_error)
header->pnfs_error = -EIO;
pnfs_set_lo_fail(header->lseg);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 773774531aff..73a1f928226c 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -76,7 +76,10 @@ nfs4_callback_svc(void *vrqstp)
set_freezable();
- while (!kthread_should_stop()) {
+ while (!kthread_freezable_should_stop(NULL)) {
+
+ if (signal_pending(current))
+ flush_signals(current);
/*
* Listen for a request on the socket
*/
@@ -85,6 +88,8 @@ nfs4_callback_svc(void *vrqstp)
continue;
svc_process(rqstp);
}
+ svc_exit_thread(rqstp);
+ module_put_and_exit(0);
return 0;
}
@@ -103,9 +108,10 @@ nfs41_callback_svc(void *vrqstp)
set_freezable();
- while (!kthread_should_stop()) {
- if (try_to_freeze())
- continue;
+ while (!kthread_freezable_should_stop(NULL)) {
+
+ if (signal_pending(current))
+ flush_signals(current);
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
spin_lock_bh(&serv->sv_cb_lock);
@@ -121,11 +127,13 @@ nfs41_callback_svc(void *vrqstp)
error);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
- schedule();
+ if (!kthread_should_stop())
+ schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}
- flush_signals(current);
}
+ svc_exit_thread(rqstp);
+ module_put_and_exit(0);
return 0;
}
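
Both callback threads now follow the standard freezable-kthread shape: kthread_freezable_should_stop() folds the freeze and stop checks together, and pending signals are flushed at the top of the loop instead of the bottom. The generic pattern, reduced to a sketch:

static int example_callback_thread(void *data)
{
        set_freezable();
        while (!kthread_freezable_should_stop(NULL)) {
                if (signal_pending(current))
                        flush_signals(current);
                /* ... wait for and service one request ... */
        }
        /* release service state before exiting, as the hunks above do */
        return 0;
}
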
@@ -221,14 +229,14 @@ err_bind:
static struct svc_serv_ops nfs40_cb_sv_ops = {
.svo_function = nfs4_callback_svc,
.svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_setup = svc_set_num_threads,
+ .svo_setup = svc_set_num_threads_sync,
.svo_module = THIS_MODULE,
};
#if defined(CONFIG_NFS_V4_1)
static struct svc_serv_ops nfs41_cb_sv_ops = {
.svo_function = nfs41_callback_svc,
.svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_setup = svc_set_num_threads,
+ .svo_setup = svc_set_num_threads_sync,
.svo_module = THIS_MODULE,
};
@@ -280,7 +288,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
cb_info->users);
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
+ serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
if (!serv) {
printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
return ERR_PTR(-ENOMEM);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f073a6d2c6a5..52479f180ea1 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -131,10 +131,11 @@ restart:
if (!inode)
continue;
if (!nfs_sb_active(inode->i_sb)) {
- rcu_read_lock();
+ rcu_read_unlock();
spin_unlock(&clp->cl_lock);
iput(inode);
spin_lock(&clp->cl_lock);
+ rcu_read_lock();
goto restart;
}
return inode;
@@ -170,10 +171,11 @@ restart:
if (!inode)
continue;
if (!nfs_sb_active(inode->i_sb)) {
- rcu_read_lock();
+ rcu_read_unlock();
spin_unlock(&clp->cl_lock);
iput(inode);
spin_lock(&clp->cl_lock);
+ rcu_read_lock();
goto restart;
}
return inode;
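
The two hunks above fix an unlock/lock inversion: the old code re-took rcu_read_lock() where it meant to drop it. Since iput() can sleep, both the RCU read lock and cl_lock must be released first and reacquired in reverse order before the restart; the corrected shape, as a fragment:

        /* drop locks innermost-first before the sleeping iput() ... */
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);
        iput(inode);            /* may sleep */
        /* ... and retake them in the opposite order before rescanning */
        spin_lock(&clp->cl_lock);
        rcu_read_lock();
        goto restart;
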
@@ -317,31 +319,18 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
static u32 do_callback_layoutrecall(struct nfs_client *clp,
struct cb_layoutrecallargs *args)
{
- u32 res;
-
- dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
if (args->cbl_recall_type == RETURN_FILE)
- res = initiate_file_draining(clp, args);
- else
- res = initiate_bulk_draining(clp, args);
- dprintk("%s returning %i\n", __func__, res);
- return res;
-
+ return initiate_file_draining(clp, args);
+ return initiate_bulk_draining(clp, args);
}
__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
void *dummy, struct cb_process_state *cps)
{
- u32 res;
-
- dprintk("%s: -->\n", __func__);
+ u32 res = NFS4ERR_OP_NOT_IN_SESSION;
if (cps->clp)
res = do_callback_layoutrecall(cps->clp, args);
- else
- res = NFS4ERR_OP_NOT_IN_SESSION;
-
- dprintk("%s: exit with status = %d\n", __func__, res);
return cpu_to_be32(res);
}
@@ -364,8 +353,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
struct nfs_client *clp = cps->clp;
struct nfs_server *server = NULL;
- dprintk("%s: -->\n", __func__);
-
if (!clp) {
res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
goto out;
@@ -384,8 +371,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
goto found;
}
rcu_read_unlock();
- dprintk("%s: layout type %u not found\n",
- __func__, dev->cbd_layout_type);
continue;
}
@@ -395,8 +380,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
out:
kfree(args->devs);
- dprintk("%s: exit with status = %u\n",
- __func__, be32_to_cpu(res));
return res;
}
@@ -417,16 +400,11 @@ static __be32
validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot,
const struct cb_sequenceargs * args)
{
- dprintk("%s enter. slotid %u seqid %u, slot table seqid: %u\n",
- __func__, args->csa_slotid, args->csa_sequenceid, slot->seq_nr);
-
if (args->csa_slotid > tbl->server_highest_slotid)
return htonl(NFS4ERR_BADSLOT);
/* Replay */
if (args->csa_sequenceid == slot->seq_nr) {
- dprintk("%s seqid %u is a replay\n",
- __func__, args->csa_sequenceid);
if (nfs4_test_locked_slot(tbl, slot->slot_nr))
return htonl(NFS4ERR_DELAY);
/* Signal process_op to set this error on next op */
@@ -480,15 +458,6 @@ static bool referring_call_exists(struct nfs_client *clp,
for (j = 0; j < rclist->rcl_nrefcalls; j++) {
ref = &rclist->rcl_refcalls[j];
-
- dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u "
- "slotid %u\n", __func__,
- ((u32 *)&rclist->rcl_sessionid.data)[0],
- ((u32 *)&rclist->rcl_sessionid.data)[1],
- ((u32 *)&rclist->rcl_sessionid.data)[2],
- ((u32 *)&rclist->rcl_sessionid.data)[3],
- ref->rc_sequenceid, ref->rc_slotid);
-
status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid,
ref->rc_sequenceid, HZ >> 1) < 0;
if (status)
@@ -593,8 +562,6 @@ out:
res->csr_status = status;
trace_nfs4_cb_sequence(args, res, status);
- dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
- ntohl(status), ntohl(res->csr_status));
return status;
}
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index d051fc3583a9..390ac9c39c59 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -171,8 +171,6 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
}
hdr->nops = ntohl(*p);
- dprintk("%s: minorversion %d nops %d\n", __func__,
- hdr->minorversion, hdr->nops);
return 0;
}
@@ -192,11 +190,8 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
status = decode_fh(xdr, &args->fh);
if (unlikely(status != 0))
- goto out;
- status = decode_bitmap(xdr, args->bitmap);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- return status;
+ return status;
+ return decode_bitmap(xdr, args->bitmap);
}
static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args)
@@ -206,17 +201,12 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_delegation_stateid(xdr, &args->stateid);
if (unlikely(status != 0))
- goto out;
+ return status;
p = read_buf(xdr, 4);
- if (unlikely(p == NULL)) {
- status = htonl(NFS4ERR_RESOURCE);
- goto out;
- }
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
args->truncate = ntohl(*p);
- status = decode_fh(xdr, &args->fh);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- return status;
+ return decode_fh(xdr, &args->fh);
}
#if defined(CONFIG_NFS_V4_1)
@@ -235,10 +225,8 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
uint32_t iomode;
p = read_buf(xdr, 4 * sizeof(uint32_t));
- if (unlikely(p == NULL)) {
- status = htonl(NFS4ERR_BADXDR);
- goto out;
- }
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
args->cbl_layout_type = ntohl(*p++);
/* Despite the spec's xdr, iomode really belongs in the FILE switch,
@@ -252,37 +240,23 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
args->cbl_range.iomode = iomode;
status = decode_fh(xdr, &args->cbl_fh);
if (unlikely(status != 0))
- goto out;
+ return status;
p = read_buf(xdr, 2 * sizeof(uint64_t));
- if (unlikely(p == NULL)) {
- status = htonl(NFS4ERR_BADXDR);
- goto out;
- }
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbl_range.offset);
p = xdr_decode_hyper(p, &args->cbl_range.length);
- status = decode_layout_stateid(xdr, &args->cbl_stateid);
- if (unlikely(status != 0))
- goto out;
+ return decode_layout_stateid(xdr, &args->cbl_stateid);
} else if (args->cbl_recall_type == RETURN_FSID) {
p = read_buf(xdr, 2 * sizeof(uint64_t));
- if (unlikely(p == NULL)) {
- status = htonl(NFS4ERR_BADXDR);
- goto out;
- }
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbl_fsid.major);
p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
- } else if (args->cbl_recall_type != RETURN_ALL) {
- status = htonl(NFS4ERR_BADXDR);
- goto out;
- }
- dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
- __func__,
- args->cbl_layout_type, iomode,
- args->cbl_layoutchanged, args->cbl_recall_type);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- return status;
+ } else if (args->cbl_recall_type != RETURN_ALL)
+ return htonl(NFS4ERR_BADXDR);
+ return 0;
}
static
@@ -437,12 +411,11 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
status = decode_sessionid(xdr, &args->csa_sessionid);
if (status)
- goto out;
+ return status;
- status = htonl(NFS4ERR_RESOURCE);
p = read_buf(xdr, 5 * sizeof(uint32_t));
if (unlikely(p == NULL))
- goto out;
+ return htonl(NFS4ERR_RESOURCE);
args->csa_addr = svc_addr(rqstp);
args->csa_sequenceid = ntohl(*p++);
@@ -456,7 +429,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
sizeof(*args->csa_rclists),
GFP_KERNEL);
if (unlikely(args->csa_rclists == NULL))
- goto out;
+ return htonl(NFS4ERR_RESOURCE);
for (i = 0; i < args->csa_nrclists; i++) {
status = decode_rc_list(xdr, &args->csa_rclists[i]);
@@ -466,27 +439,13 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
}
}
}
- status = 0;
-
- dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u "
- "highestslotid %u cachethis %d nrclists %u\n",
- __func__,
- ((u32 *)&args->csa_sessionid)[0],
- ((u32 *)&args->csa_sessionid)[1],
- ((u32 *)&args->csa_sessionid)[2],
- ((u32 *)&args->csa_sessionid)[3],
- args->csa_sequenceid, args->csa_slotid,
- args->csa_highestslotid, args->csa_cachethis,
- args->csa_nrclists);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- return status;
+ return 0;
out_free:
for (i = 0; i < args->csa_nrclists; i++)
kfree(args->csa_rclists[i].rcl_refcalls);
kfree(args->csa_rclists);
- goto out;
+ return status;
}
static __be32 decode_recallany_args(struct svc_rqst *rqstp,
@@ -557,11 +516,8 @@ static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream
status = decode_fh(xdr, &args->cbnl_fh);
if (unlikely(status != 0))
- goto out;
- status = decode_lockowner(xdr, args);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- return status;
+ return status;
+ return decode_lockowner(xdr, args);
}
#endif /* CONFIG_NFS_V4_1 */
@@ -707,7 +663,6 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr,
status = encode_attr_mtime(xdr, res->bitmap, &res->mtime);
*savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1)));
out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
}
@@ -734,11 +689,11 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
__be32 status = res->csr_status;
if (unlikely(status != 0))
- goto out;
+ return status;
status = encode_sessionid(xdr, &res->csr_sessionid);
if (status)
- goto out;
+ return status;
p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL))
@@ -748,9 +703,7 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
*p++ = htonl(res->csr_slotid);
*p++ = htonl(res->csr_highestslotid);
*p++ = htonl(res->csr_target_highestslotid);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- return status;
+ return 0;
}
static __be32
@@ -800,7 +753,6 @@ static void nfs4_callback_free_slot(struct nfs4_session *session,
* A single slot, so highest used slotid is either 0 or -1
*/
nfs4_free_slot(tbl, slot);
- nfs4_slot_tbl_drain_complete(tbl);
spin_unlock(&tbl->slot_tbl_lock);
}
@@ -871,14 +823,10 @@ static __be32 process_op(int nop, struct svc_rqst *rqstp,
long maxlen;
__be32 res;
- dprintk("%s: start\n", __func__);
status = decode_op_hdr(xdr_in, &op_nr);
if (unlikely(status))
return status;
- dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
- __func__, cps->minorversion, nop, op_nr);
-
switch (cps->minorversion) {
case 0:
status = preprocess_nfs4_op(op_nr, &op);
@@ -917,7 +865,6 @@ encode_hdr:
return res;
if (op->encode_res != NULL && status == 0)
status = op->encode_res(rqstp, xdr_out, resp);
- dprintk("%s: done, status = %d\n", __func__, ntohl(status));
return status;
}
@@ -937,8 +884,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
};
unsigned int nops = 0;
- dprintk("%s: start\n", __func__);
-
xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
@@ -977,7 +922,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
*hdr_res.nops = htonl(nops);
nfs4_cb_free_slot(&cps);
nfs_put_client(cps.clp);
- dprintk("%s: done, status = %u\n", __func__, ntohl(status));
return rpc_success;
out_invalidcred:
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 04d15a0045e3..ee5ddbd36088 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -218,6 +218,7 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
static void pnfs_init_server(struct nfs_server *server)
{
rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
+ rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC");
}
#else
@@ -240,8 +241,6 @@ static void pnfs_init_server(struct nfs_server *server)
*/
void nfs_free_client(struct nfs_client *clp)
{
- dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
-
nfs_fscache_release_client_cookie(clp);
/* -EIO all pending I/O */
@@ -256,8 +255,6 @@ void nfs_free_client(struct nfs_client *clp)
kfree(clp->cl_hostname);
kfree(clp->cl_acceptor);
kfree(clp);
-
- dprintk("<-- nfs_free_client()\n");
}
EXPORT_SYMBOL_GPL(nfs_free_client);
@@ -271,7 +268,6 @@ void nfs_put_client(struct nfs_client *clp)
if (!clp)
return;
- dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count));
nn = net_generic(clp->cl_net, nfs_net_id);
if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {
@@ -382,9 +378,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
}
smp_rmb();
-
- dprintk("<-- %s found nfs_client %p for %s\n",
- __func__, clp, cl_init->hostname ?: "");
return clp;
}
@@ -403,9 +396,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
return NULL;
}
- dprintk("--> nfs_get_client(%s,v%u)\n",
- cl_init->hostname, rpc_ops->version);
-
/* see if the client already exists */
do {
spin_lock(&nn->nfs_client_lock);
@@ -430,8 +420,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
new = rpc_ops->alloc_client(cl_init);
} while (!IS_ERR(new));
- dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
- cl_init->hostname, PTR_ERR(new));
return new;
}
EXPORT_SYMBOL_GPL(nfs_get_client);
@@ -558,6 +546,7 @@ static int nfs_start_lockd(struct nfs_server *server)
.noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
1 : 0,
.net = clp->cl_net,
+ .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops,
};
if (nlm_init.nfs_version > 3)
@@ -624,27 +613,21 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp,
{
int error;
- if (clp->cl_cons_state == NFS_CS_READY) {
- /* the client is already initialised */
- dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp);
+ /* the client is already initialised */
+ if (clp->cl_cons_state == NFS_CS_READY)
return clp;
- }
/*
* Create a client RPC handle for doing FSSTAT with UNIX auth only
* - RFC 2623, sec 2.3.2
*/
error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
- if (error < 0)
- goto error;
- nfs_mark_client_ready(clp, NFS_CS_READY);
+ nfs_mark_client_ready(clp, error == 0 ? NFS_CS_READY : error);
+ if (error < 0) {
+ nfs_put_client(clp);
+ clp = ERR_PTR(error);
+ }
return clp;
-
-error:
- nfs_mark_client_ready(clp, error);
- nfs_put_client(clp);
- dprintk("<-- nfs_init_client() = xerror %d\n", error);
- return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(nfs_init_client);
@@ -668,8 +651,6 @@ static int nfs_init_server(struct nfs_server *server,
struct nfs_client *clp;
int error;
- dprintk("--> nfs_init_server()\n");
-
nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
data->timeo, data->retrans);
if (data->flags & NFS_MOUNT_NORESVPORT)
@@ -677,10 +658,8 @@ static int nfs_init_server(struct nfs_server *server,
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init);
- if (IS_ERR(clp)) {
- dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
+ if (IS_ERR(clp))
return PTR_ERR(clp);
- }
server->nfs_client = clp;
@@ -725,13 +704,11 @@ static int nfs_init_server(struct nfs_server *server,
server->mountd_protocol = data->mount_server.protocol;
server->namelen = data->namlen;
- dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp);
return 0;
error:
server->nfs_client = NULL;
nfs_put_client(clp);
- dprintk("<-- nfs_init_server() = xerror %d\n", error);
return error;
}
@@ -798,12 +775,10 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
struct nfs_client *clp = server->nfs_client;
int error;
- dprintk("--> nfs_probe_fsinfo()\n");
-
if (clp->rpc_ops->set_capabilities != NULL) {
error = clp->rpc_ops->set_capabilities(server, mntfh);
if (error < 0)
- goto out_error;
+ return error;
}
fsinfo.fattr = fattr;
@@ -811,7 +786,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype));
error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
if (error < 0)
- goto out_error;
+ return error;
nfs_server_set_fsinfo(server, &fsinfo);
@@ -826,12 +801,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
server->namelen = pathinfo.max_namelen;
}
- dprintk("<-- nfs_probe_fsinfo() = 0\n");
return 0;
-
-out_error:
- dprintk("nfs_probe_fsinfo: error = %d\n", -error);
- return error;
}
EXPORT_SYMBOL_GPL(nfs_probe_fsinfo);
@@ -927,8 +897,6 @@ EXPORT_SYMBOL_GPL(nfs_alloc_server);
*/
void nfs_free_server(struct nfs_server *server)
{
- dprintk("--> nfs_free_server()\n");
-
nfs_server_remove_lists(server);
if (server->destroy != NULL)
@@ -946,7 +914,6 @@ void nfs_free_server(struct nfs_server *server)
nfs_free_iostats(server->io_stats);
kfree(server);
nfs_release_automount_timer();
- dprintk("<-- nfs_free_server()\n");
}
EXPORT_SYMBOL_GPL(nfs_free_server);
@@ -1026,10 +993,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
struct nfs_fattr *fattr_fsinfo;
int error;
- dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
- (unsigned long long) fattr->fsid.major,
- (unsigned long long) fattr->fsid.minor);
-
server = nfs_alloc_server();
if (!server)
return ERR_PTR(-ENOMEM);
@@ -1061,10 +1024,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
server->namelen = NFS4_MAXNAMLEN;
- dprintk("Cloned FSID: %llx:%llx\n",
- (unsigned long long) server->fsid.major,
- (unsigned long long) server->fsid.minor);
-
error = nfs_start_lockd(server);
if (error < 0)
goto out_free_server;
@@ -1073,13 +1032,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
server->mount_time = jiffies;
nfs_free_fattr(fattr_fsinfo);
- dprintk("<-- nfs_clone_server() = %p\n", server);
return server;
out_free_server:
nfs_free_fattr(fattr_fsinfo);
nfs_free_server(server);
- dprintk("<-- nfs_clone_server() = error %d\n", error);
return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(nfs_clone_server);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f92ba8d6c556..2ac00bf4ecf1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -57,7 +57,7 @@ static void nfs_readdir_clear_array(struct page*);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
.read = generic_read_dir,
- .iterate_shared = nfs_readdir,
+ .iterate = nfs_readdir,
.open = nfs_opendir,
.release = nfs_closedir,
.fsync = nfs_fsync_dir,
@@ -145,7 +145,6 @@ struct nfs_cache_array_entry {
};
struct nfs_cache_array {
- atomic_t refcount;
int size;
int eof_index;
u64 last_cookie;
@@ -171,27 +170,6 @@ typedef struct {
} nfs_readdir_descriptor_t;
/*
- * The caller is responsible for calling nfs_readdir_release_array(page)
- */
-static
-struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
-{
- void *ptr;
- if (page == NULL)
- return ERR_PTR(-EIO);
- ptr = kmap(page);
- if (ptr == NULL)
- return ERR_PTR(-ENOMEM);
- return ptr;
-}
-
-static
-void nfs_readdir_release_array(struct page *page)
-{
- kunmap(page);
-}
-
-/*
* we are freeing strings created by nfs_add_to_readdir_array()
*/
static
@@ -201,18 +179,9 @@ void nfs_readdir_clear_array(struct page *page)
int i;
array = kmap_atomic(page);
- if (atomic_dec_and_test(&array->refcount))
- for (i = 0; i < array->size; i++)
- kfree(array->array[i].string.name);
- kunmap_atomic(array);
-}
-
-static bool grab_page(struct page *page)
-{
- struct nfs_cache_array *array = kmap_atomic(page);
- bool res = atomic_inc_not_zero(&array->refcount);
+ for (i = 0; i < array->size; i++)
+ kfree(array->array[i].string.name);
kunmap_atomic(array);
- return res;
}
/*
@@ -239,13 +208,10 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le
static
int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
{
- struct nfs_cache_array *array = nfs_readdir_get_array(page);
+ struct nfs_cache_array *array = kmap(page);
struct nfs_cache_array_entry *cache_entry;
int ret;
- if (IS_ERR(array))
- return PTR_ERR(array);
-
cache_entry = &array->array[array->size];
/* Check that this entry lies within the page bounds */
@@ -264,7 +230,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
if (entry->eof != 0)
array->eof_index = array->size;
out:
- nfs_readdir_release_array(page);
+ kunmap(page);
return ret;
}
@@ -353,11 +319,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
struct nfs_cache_array *array;
int status;
- array = nfs_readdir_get_array(desc->page);
- if (IS_ERR(array)) {
- status = PTR_ERR(array);
- goto out;
- }
+ array = kmap(desc->page);
if (*desc->dir_cookie == 0)
status = nfs_readdir_search_for_pos(array, desc);
@@ -369,8 +331,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
desc->current_index += array->size;
desc->page_index++;
}
- nfs_readdir_release_array(desc->page);
-out:
+ kunmap(desc->page);
return status;
}
@@ -606,13 +567,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
out_nopages:
if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
- array = nfs_readdir_get_array(page);
- if (!IS_ERR(array)) {
- array->eof_index = array->size;
- status = 0;
- nfs_readdir_release_array(page);
- } else
- status = PTR_ERR(array);
+ array = kmap(page);
+ array->eof_index = array->size;
+ status = 0;
+ kunmap(page);
}
put_page(scratch);
@@ -674,13 +632,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
goto out;
}
- array = nfs_readdir_get_array(page);
- if (IS_ERR(array)) {
- status = PTR_ERR(array);
- goto out_label_free;
- }
+ array = kmap(page);
memset(array, 0, sizeof(struct nfs_cache_array));
- atomic_set(&array->refcount, 1);
array->eof_index = -1;
status = nfs_readdir_alloc_pages(pages, array_size);
@@ -703,8 +656,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
nfs_readdir_free_pages(pages, array_size);
out_release_array:
- nfs_readdir_release_array(page);
-out_label_free:
+ kunmap(page);
nfs4_label_free(entry.label);
out:
nfs_free_fattr(entry.fattr);
@@ -743,7 +695,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
static
void cache_page_release(nfs_readdir_descriptor_t *desc)
{
- nfs_readdir_clear_array(desc->page);
+ if (!desc->page->mapping)
+ nfs_readdir_clear_array(desc->page);
put_page(desc->page);
desc->page = NULL;
}
@@ -751,16 +704,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
static
struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
{
- struct page *page;
-
- for (;;) {
- page = read_cache_page(desc->file->f_mapping,
+ return read_cache_page(desc->file->f_mapping,
desc->page_index, (filler_t *)nfs_readdir_filler, desc);
- if (IS_ERR(page) || grab_page(page))
- break;
- put_page(page);
- }
- return page;
}
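
The readdir cache rework above also removes the nfs_readdir_get_array()/release wrapper: kmap() on a page-cache page cannot fail, so the ERR_PTR plumbing was dead weight. Accesses reduce to the plain pattern, sketched here:

        struct nfs_cache_array *array;

        array = kmap(page);     /* cannot fail for a page-cache page */
        /* ... inspect or update array->size, array->eof_index, ... */
        kunmap(page);
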
/*
@@ -809,12 +754,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
struct nfs_cache_array *array = NULL;
struct nfs_open_dir_context *ctx = file->private_data;
- array = nfs_readdir_get_array(desc->page);
- if (IS_ERR(array)) {
- res = PTR_ERR(array);
- goto out;
- }
-
+ array = kmap(desc->page);
for (i = desc->cache_entry_index; i < array->size; i++) {
struct nfs_cache_array_entry *ent;
@@ -835,8 +775,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
if (array->eof_index >= 0)
desc->eof = 1;
- nfs_readdir_release_array(desc->page);
-out:
+ kunmap(desc->page);
cache_page_release(desc);
dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
(unsigned long long)*desc->dir_cookie, res);
@@ -966,11 +905,13 @@ out:
static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
{
+ struct inode *inode = file_inode(filp);
struct nfs_open_dir_context *dir_ctx = filp->private_data;
dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
filp, offset, whence);
+ inode_lock(inode);
switch (whence) {
case 1:
offset += filp->f_pos;
@@ -978,13 +919,16 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
if (offset >= 0)
break;
default:
- return -EINVAL;
+ offset = -EINVAL;
+ goto out;
}
if (offset != filp->f_pos) {
filp->f_pos = offset;
dir_ctx->dir_cookie = 0;
dir_ctx->duped = 0;
}
+out:
+ inode_unlock(inode);
return offset;
}
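
With ->iterate_shared reverted to ->iterate above, nfs_llseek_dir() now takes the inode lock so that f_pos and the cached readdir cookie change atomically with respect to a concurrent readdir. The locking skeleton, as a sketch:

static loff_t example_llseek_dir(struct file *filp, loff_t offset)
{
        struct inode *inode = file_inode(filp);

        inode_lock(inode);
        if (offset != filp->f_pos) {
                filp->f_pos = offset;
                /* reset per-open readdir cookie state here, as above */
        }
        inode_unlock(inode);
        return offset;
}
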
@@ -2002,29 +1946,6 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
}
EXPORT_SYMBOL_GPL(nfs_link);
-static void
-nfs_complete_rename(struct rpc_task *task, struct nfs_renamedata *data)
-{
- struct dentry *old_dentry = data->old_dentry;
- struct dentry *new_dentry = data->new_dentry;
- struct inode *old_inode = d_inode(old_dentry);
- struct inode *new_inode = d_inode(new_dentry);
-
- nfs_mark_for_revalidate(old_inode);
-
- switch (task->tk_status) {
- case 0:
- if (new_inode != NULL)
- nfs_drop_nlink(new_inode);
- d_move(old_dentry, new_dentry);
- nfs_set_verifier(new_dentry,
- nfs_save_change_attribute(data->new_dir));
- break;
- case -ENOENT:
- nfs_dentry_handle_enoent(old_dentry);
- }
-}
-
/*
* RENAME
* FIXME: Some nfsds, like the Linux user space nfsd, may generate a
@@ -2055,7 +1976,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
- struct dentry *dentry = NULL;
+ struct dentry *dentry = NULL, *rehash = NULL;
struct rpc_task *task;
int error = -EBUSY;
@@ -2078,8 +1999,10 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* To prevent any new references to the target during the
* rename, we unhash the dentry in advance.
*/
- if (!d_unhashed(new_dentry))
+ if (!d_unhashed(new_dentry)) {
d_drop(new_dentry);
+ rehash = new_dentry;
+ }
if (d_count(new_dentry) > 2) {
int err;
@@ -2096,6 +2019,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out;
new_dentry = dentry;
+ rehash = NULL;
new_inode = NULL;
}
}
@@ -2104,8 +2028,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_inode != NULL)
NFS_PROTO(new_inode)->return_delegation(new_inode);
- task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
- nfs_complete_rename);
+ task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
if (IS_ERR(task)) {
error = PTR_ERR(task);
goto out;
@@ -2115,9 +2038,27 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (error == 0)
error = task->tk_status;
rpc_put_task(task);
+ nfs_mark_for_revalidate(old_inode);
out:
+ if (rehash)
+ d_rehash(rehash);
trace_nfs_rename_exit(old_dir, old_dentry,
new_dir, new_dentry, error);
+ if (!error) {
+ if (new_inode != NULL)
+ nfs_drop_nlink(new_inode);
+ /*
+ * The d_move() should be here instead of in an async RPC completion
+ * handler because we need the proper locks to move the dentry. If
+ * we're interrupted by a signal, the async RPC completion handler
+ * should mark the directories for revalidation.
+ */
+ d_move(old_dentry, new_dentry);
+ nfs_set_verifier(new_dentry,
+ nfs_save_change_attribute(new_dir));
+ } else if (error == -ENOENT)
+ nfs_dentry_handle_enoent(old_dentry);
+
/* new dentry created? */
if (dentry)
dput(dentry);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index c1b5fed7c863..6fb9fad2d1e6 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -392,16 +392,6 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
nfs_direct_req_release(dreq);
}
-static void nfs_direct_readpage_release(struct nfs_page *req)
-{
- dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",
- req->wb_context->dentry->d_sb->s_id,
- (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)),
- req->wb_bytes,
- (long long)req_offset(req));
- nfs_release_request(req);
-}
-
static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
{
unsigned long bytes = 0;
@@ -426,7 +416,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
set_page_dirty(page);
bytes += req->wb_bytes;
nfs_list_remove_request(req);
- nfs_direct_readpage_release(req);
+ nfs_release_request(req);
}
out_put:
if (put_dreq(dreq))
@@ -700,16 +690,9 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
int status = data->task.tk_status;
nfs_init_cinfo_from_dreq(&cinfo, dreq);
- if (status < 0) {
- dprintk("NFS: %5u commit failed with error %d.\n",
- data->task.tk_pid, status);
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
- } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) {
- dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
+ if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data))
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
- }
- dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 668213984d68..5713eb32a45e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -482,7 +482,7 @@ static int nfs_launder_page(struct page *page)
inode->i_ino, (long long)page_offset(page));
nfs_fscache_wait_on_page_write(nfsi, page);
- return nfs_wb_launder_page(inode, page);
+ return nfs_wb_page(inode, page);
}
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -697,14 +697,14 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
if (!IS_ERR(l_ctx)) {
status = nfs_iocounter_wait(l_ctx);
nfs_put_lock_context(l_ctx);
- if (status < 0)
+ /* NOTE: special case
+ * If we're signalled while cleaning up locks on process exit, we
+ * still need to complete the unlock.
+ */
+ if (status < 0 && !(fl->fl_flags & FL_CLOSE))
return status;
}
- /* NOTE: special case
- * If we're signalled while cleaning up locks on process exit, we
- * still need to complete the unlock.
- */
/*
* Use local locking if mounted with "-onolock" or with appropriate
* "-olocal_lock="
@@ -820,9 +820,23 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
is_local = 1;
- /* We're simulating flock() locks using posix locks on the server */
- if (fl->fl_type == F_UNLCK)
+ /*
+ * VFS doesn't require the open mode to match a flock() lock's type.
+ * NFS, however, may simulate flock() locking with posix locking which
+ * requires the open mode to match the lock type.
+ */
+ switch (fl->fl_type) {
+ case F_UNLCK:
return do_unlk(filp, cmd, fl, is_local);
+ case F_RDLCK:
+ if (!(filp->f_mode & FMODE_READ))
+ return -EBADF;
+ break;
+ case F_WRLCK:
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+ }
+
return do_setlk(filp, cmd, fl, is_local);
}
EXPORT_SYMBOL_GPL(nfs_flock);
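
Because NFS may emulate flock() with POSIX locks, and POSIX locks require an open mode matching the lock type, the switch above rejects mismatched requests with -EBADF instead of letting the server fail them. The check in isolation:

        switch (fl->fl_type) {
        case F_RDLCK:
                if (!(filp->f_mode & FMODE_READ))
                        return -EBADF;  /* read lock needs a readable fd */
                break;
        case F_WRLCK:
                if (!(filp->f_mode & FMODE_WRITE))
                        return -EBADF;  /* write lock needs a writable fd */
        }
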
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index acd30baca461..1cf85d65b748 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -921,11 +921,11 @@ fl_pnfs_update_layout(struct inode *ino,
fl = FILELAYOUT_LSEG(lseg);
status = filelayout_check_deviceid(lo, fl, gfp_flags);
- if (status)
+ if (status) {
+ pnfs_put_lseg(lseg);
lseg = ERR_PTR(status);
+ }
out:
- if (IS_ERR(lseg))
- pnfs_put_lseg(lseg);
return lseg;
}
@@ -933,6 +933,7 @@ static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
+ pnfs_generic_pg_check_layout(pgio);
if (!pgio->pg_lseg) {
pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
req->wb_context,
@@ -959,6 +960,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_commit_info cinfo;
int status;
+ pnfs_generic_pg_check_layout(pgio);
if (!pgio->pg_lseg) {
pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
req->wb_context,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 42dedf2d625f..23542dc44a25 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -454,6 +454,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
/* fh */
+ rc = -EIO;
p = xdr_inline_decode(&stream, 4);
if (!p)
goto out_err_free;
@@ -846,6 +847,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
int ds_idx;
retry:
+ pnfs_generic_pg_check_layout(pgio);
/* Use full layout for now */
if (!pgio->pg_lseg)
ff_layout_pg_get_read(pgio, req, false);
@@ -894,6 +896,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
int status;
retry:
+ pnfs_generic_pg_check_layout(pgio);
if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
@@ -1800,16 +1803,16 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
if (!ds)
- return PNFS_NOT_ATTEMPTED;
+ goto out_failed;
ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
hdr->inode);
if (IS_ERR(ds_clnt))
- return PNFS_NOT_ATTEMPTED;
+ goto out_failed;
ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
if (!ds_cred)
- return PNFS_NOT_ATTEMPTED;
+ goto out_failed;
vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1839,6 +1842,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
sync, RPC_TASK_SOFTCONN);
put_rpccred(ds_cred);
return PNFS_ATTEMPTED;
+
+out_failed:
+ if (ff_layout_avoid_mds_available_ds(lseg))
+ return PNFS_TRY_AGAIN;
+ return PNFS_NOT_ATTEMPTED;
}
static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
@@ -2354,10 +2362,21 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
return 0;
}
+static int
+ff_layout_set_layoutdriver(struct nfs_server *server,
+ const struct nfs_fh *dummy)
+{
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+ server->caps |= NFS_CAP_LAYOUTSTATS;
+#endif
+ return 0;
+}
+
static struct pnfs_layoutdriver_type flexfilelayout_type = {
.id = LAYOUT_FLEX_FILES,
.name = "LAYOUT_FLEX_FILES",
.owner = THIS_MODULE,
+ .set_layoutdriver = ff_layout_set_layoutdriver,
.alloc_layout_hdr = ff_layout_alloc_layout_hdr,
.free_layout_hdr = ff_layout_free_layout_hdr,
.alloc_lseg = ff_layout_alloc_lseg,
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 457cfeb1d5c1..6df7a0cf5660 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -119,7 +119,13 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
- if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) {
+ /*
+ * check for valid major/minor combination.
+ * currently we support dataserver which talk:
+ * v3, v4.0, v4.1, v4.2
+ */
+ if (!((ds_versions[i].version == 3 && ds_versions[i].minor_version == 0) ||
+ (ds_versions[i].version == 4 && ds_versions[i].minor_version < 3))) {
dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
i, ds_versions[i].version,
ds_versions[i].minor_version);
@@ -415,7 +421,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror->mirror_ds->ds_versions[0].minor_version);
/* connect success, check rsize/wsize limit */
- if (ds->ds_clp) {
+ if (!status) {
max_payload =
nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
NULL);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f489a5a71bd5..1de93ba78dc9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -734,7 +734,10 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
if (need_atime || nfs_need_revalidate_inode(inode)) {
struct nfs_server *server = NFS_SERVER(inode);
- nfs_readdirplus_parent_cache_miss(path->dentry);
+ if (!(server->flags & NFS_MOUNT_NOAC))
+ nfs_readdirplus_parent_cache_miss(path->dentry);
+ else
+ nfs_readdirplus_parent_cache_hit(path->dentry);
err = __nfs_revalidate_inode(server, inode);
} else
nfs_readdirplus_parent_cache_hit(path->dentry);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7b38fedb7e03..8701d7617964 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -7,6 +7,7 @@
#include <linux/security.h>
#include <linux/crc32.h>
#include <linux/nfs_page.h>
+#include <linux/wait_bit.h>
#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
@@ -398,7 +399,6 @@ extern struct file_system_type nfs4_referral_fs_type;
bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);
struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *,
struct nfs_subversion *);
-void nfs_initialise_sb(struct super_block *);
int nfs_set_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *);
int nfs_clone_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *);
struct dentry *nfs_fs_mount_common(struct nfs_server *, int, const char *,
@@ -458,7 +458,6 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
/* super.c */
-void nfs_clone_super(struct super_block *, struct nfs_mount_info *);
void nfs_umount_begin(struct super_block *);
int nfs_statfs(struct dentry *, struct kstatfs *);
int nfs_show_options(struct seq_file *, struct dentry *);
@@ -495,7 +494,6 @@ void nfs_mark_request_commit(struct nfs_page *req,
u32 ds_commit_idx);
int nfs_write_need_commit(struct nfs_pgio_header *);
void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
-int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf);
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
int how, struct nfs_commit_info *cinfo);
void nfs_retry_commit(struct list_head *page_list,
@@ -756,9 +754,13 @@ static inline bool nfs_error_is_fatal(int err)
{
switch (err) {
case -ERESTARTSYS:
+ case -EACCES:
+ case -EDQUOT:
+ case -EFBIG:
case -EIO:
case -ENOSPC:
case -EROFS:
+ case -ESTALE:
case -E2BIG:
return true;
default:
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 786f17580582..e5686be67be8 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -143,11 +143,8 @@ struct vfsmount *nfs_d_automount(struct path *path)
struct nfs_fh *fh = NULL;
struct nfs_fattr *fattr = NULL;
- dprintk("--> nfs_d_automount()\n");
-
- mnt = ERR_PTR(-ESTALE);
if (IS_ROOT(path->dentry))
- goto out_nofree;
+ return ERR_PTR(-ESTALE);
mnt = ERR_PTR(-ENOMEM);
fh = nfs_alloc_fhandle();
@@ -155,13 +152,10 @@ struct vfsmount *nfs_d_automount(struct path *path)
if (fh == NULL || fattr == NULL)
goto out;
- dprintk("%s: enter\n", __func__);
-
mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr);
if (IS_ERR(mnt))
goto out;
- dprintk("%s: done, success\n", __func__);
mntget(mnt); /* prevent immediate expiration */
mnt_set_expiry(mnt, &nfs_automount_list);
schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
@@ -169,11 +163,6 @@ struct vfsmount *nfs_d_automount(struct path *path)
out:
nfs_free_fattr(fattr);
nfs_free_fhandle(fh);
-out_nofree:
- if (IS_ERR(mnt))
- dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt));
- else
- dprintk("<-- %s() = %p\n", __func__, mnt);
return mnt;
}
@@ -248,27 +237,20 @@ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh,
.fattr = fattr,
.authflavor = authflavor,
};
- struct vfsmount *mnt = ERR_PTR(-ENOMEM);
+ struct vfsmount *mnt;
char *page = (char *) __get_free_page(GFP_USER);
char *devname;
- dprintk("--> nfs_do_submount()\n");
-
- dprintk("%s: submounting on %pd2\n", __func__,
- dentry);
if (page == NULL)
- goto out;
+ return ERR_PTR(-ENOMEM);
+
devname = nfs_devname(dentry, page, PAGE_SIZE);
- mnt = (struct vfsmount *)devname;
if (IS_ERR(devname))
- goto free_page;
- mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata);
-free_page:
- free_page((unsigned long)page);
-out:
- dprintk("%s: done\n", __func__);
+ mnt = ERR_CAST(devname);
+ else
+ mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata);
- dprintk("<-- nfs_do_submount() = %p\n", mnt);
+ free_page((unsigned long)page);
return mnt;
}
EXPORT_SYMBOL_GPL(nfs_do_submount);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index dc925b531f32..0c07b567118d 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -865,12 +865,63 @@ static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess
msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT];
}
+static void nfs3_nlm_alloc_call(void *data)
+{
+ struct nfs_lock_context *l_ctx = data;
+ if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) {
+ get_nfs_open_context(l_ctx->open_context);
+ nfs_get_lock_context(l_ctx->open_context);
+ }
+}
+
+static bool nfs3_nlm_unlock_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs_lock_context *l_ctx = data;
+ if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags))
+ return nfs_async_iocounter_wait(task, l_ctx);
+ return false;
+}
+
+static void nfs3_nlm_release_call(void *data)
+{
+ struct nfs_lock_context *l_ctx = data;
+ struct nfs_open_context *ctx;
+ if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) {
+ ctx = l_ctx->open_context;
+ nfs_put_lock_context(l_ctx);
+ put_nfs_open_context(ctx);
+ }
+}
+
+const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = {
+ .nlmclnt_alloc_call = nfs3_nlm_alloc_call,
+ .nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare,
+ .nlmclnt_release_call = nfs3_nlm_release_call,
+};
+
static int
nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
{
struct inode *inode = file_inode(filp);
+ struct nfs_lock_context *l_ctx = NULL;
+ struct nfs_open_context *ctx = nfs_file_open_context(filp);
+ int status;
- return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
+ if (fl->fl_flags & FL_CLOSE) {
+ l_ctx = nfs_get_lock_context(ctx);
+ if (IS_ERR(l_ctx))
+ l_ctx = NULL;
+ else
+ set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags);
+ }
+
+ status = nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, l_ctx);
+
+ if (l_ctx)
+ nfs_put_lock_context(l_ctx);
+
+ return status;
}
static int nfs3_have_delegation(struct inode *inode, fmode_t flags)
@@ -921,6 +972,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.dir_inode_ops = &nfs3_dir_inode_operations,
.file_inode_ops = &nfs3_file_inode_operations,
.file_ops = &nfs_file_operations,
+ .nlmclnt_ops = &nlmclnt_fl_close_lock_ops,
.getroot = nfs3_proc_get_root,
.submount = nfs_submount,
.try_mount = nfs_try_mount,
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 1e486c73ec94..319a47db218d 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -167,23 +167,29 @@ static ssize_t _nfs42_proc_copy(struct file *src,
if (status)
return status;
+ res->commit_res.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS);
+ if (!res->commit_res.verf)
+ return -ENOMEM;
status = nfs4_call_sync(server->client, server, &msg,
&args->seq_args, &res->seq_res, 0);
if (status == -ENOTSUPP)
server->caps &= ~NFS_CAP_COPY;
if (status)
- return status;
+ goto out;
- if (res->write_res.verifier.committed != NFS_FILE_SYNC) {
- status = nfs_commit_file(dst, &res->write_res.verifier.verifier);
- if (status)
- return status;
+ if (nfs_write_verifier_cmp(&res->write_res.verifier.verifier,
+ &res->commit_res.verf->verifier)) {
+ status = -EAGAIN;
+ goto out;
}
truncate_pagecache_range(dst_inode, pos_dst,
pos_dst + res->write_res.count);
- return res->write_res.count;
+ status = res->write_res.count;
+out:
+ kfree(res->commit_res.verf);
+ return status;
}
ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
@@ -240,6 +246,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
if (err == -ENOTSUPP) {
err = -EOPNOTSUPP;
break;
+ } else if (err == -EAGAIN) {
+ dst_exception.retry = 1;
+ continue;
}
err2 = nfs4_handle_exception(server, err, &src_exception);
@@ -379,6 +388,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
pnfs_mark_layout_stateid_invalid(lo, &head);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
+ nfs_commit_inode(inode, 0);
} else
spin_unlock(&inode->i_lock);
break;
@@ -400,8 +410,6 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
case -EOPNOTSUPP:
NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
}
-
- dprintk("%s server returns %d\n", __func__, task->tk_status);
}
static void
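
Taken together with the XDR changes that follow, the COPY compound now carries an inline COMMIT, and a server reboot between the copy and the commit shows up as a verifier mismatch. The detection logic above condenses to this fragment:

        /* if the WRITE and COMMIT verifiers disagree, the server may have
         * rebooted mid-copy; report -EAGAIN so the caller retries */
        if (nfs_write_verifier_cmp(&res->write_res.verifier.verifier,
                                   &res->commit_res.verf->verifier))
                return -EAGAIN;
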
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 6c7296454bbc..528362f69cc1 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -66,12 +66,14 @@
encode_putfh_maxsz + \
encode_savefh_maxsz + \
encode_putfh_maxsz + \
- encode_copy_maxsz)
+ encode_copy_maxsz + \
+ encode_commit_maxsz)
#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
decode_savefh_maxsz + \
decode_putfh_maxsz + \
- decode_copy_maxsz)
+ decode_copy_maxsz + \
+ decode_commit_maxsz)
#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
encode_deallocate_maxsz + \
@@ -222,6 +224,18 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
encode_nops(&hdr);
}
+static void encode_copy_commit(struct xdr_stream *xdr,
+ struct nfs42_copy_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr);
+ p = reserve_space(xdr, 12);
+ p = xdr_encode_hyper(p, args->dst_pos);
+ *p = cpu_to_be32(args->count);
+}
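
The 12 bytes reserved above are the fixed-size COMMIT argument body: an 8-byte offset4 followed by a 4-byte count4. The same three lines, annotated:

p = reserve_space(xdr, 12);			/* 8 + 4 bytes */
p = xdr_encode_hyper(p, args->dst_pos);		/* offset4: start of the copied range */
*p = cpu_to_be32(args->count);			/* count4: number of bytes to commit */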
+
/*
* Encode COPY request
*/
@@ -239,6 +253,7 @@ static void nfs4_xdr_enc_copy(struct rpc_rqst *req,
encode_savefh(xdr, &hdr);
encode_putfh(xdr, args->dst_fh, &hdr);
encode_copy(xdr, args, &hdr);
+ encode_copy_commit(xdr, args, &hdr);
encode_nops(&hdr);
}
@@ -481,6 +496,9 @@ static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp,
if (status)
goto out;
status = decode_copy(xdr, res);
+ if (status)
+ goto out;
+ status = decode_commit(xdr, &res->commit_res);
out:
return status;
}
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 8346ccbf2d52..66776f022111 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -359,11 +359,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
struct nfs_client *old;
int error;
- if (clp->cl_cons_state == NFS_CS_READY) {
+ if (clp->cl_cons_state == NFS_CS_READY)
/* the client is initialised already */
- dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp);
return clp;
- }
/* Check NFS protocol revision and initialize RPC op vector */
clp->rpc_ops = &nfs_v4_clientops;
@@ -421,7 +419,6 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
error:
nfs_mark_client_ready(clp, error);
nfs_put_client(clp);
- dprintk("<-- nfs4_init_client() = xerror %d\n", error);
return ERR_PTR(error);
}
@@ -469,6 +466,50 @@ static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2)
return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0;
}
+static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new,
+ struct nfs_client **prev, struct nfs_net *nn)
+{
+ int status;
+
+ if (pos->rpc_ops != new->rpc_ops)
+ return 1;
+
+ if (pos->cl_minorversion != new->cl_minorversion)
+ return 1;
+
+ /* If "pos" isn't marked ready, we can't trust the
+ * remaining fields in "pos", especially the client
+ * ID and serverowner fields. Wait for CREATE_SESSION
+ * to finish. */
+ if (pos->cl_cons_state > NFS_CS_READY) {
+ atomic_inc(&pos->cl_count);
+ spin_unlock(&nn->nfs_client_lock);
+
+ nfs_put_client(*prev);
+ *prev = pos;
+
+ status = nfs_wait_client_init_complete(pos);
+ spin_lock(&nn->nfs_client_lock);
+
+ if (status < 0)
+ return status;
+ }
+
+ if (pos->cl_cons_state != NFS_CS_READY)
+ return 1;
+
+ if (pos->cl_clientid != new->cl_clientid)
+ return 1;
+
+ /* NFSv4.1 always uses the uniform string, however someone
+ * might switch the uniquifier string on us.
+ */
+ if (!nfs4_match_client_owner_id(pos, new))
+ return 1;
+
+ return 0;
+}
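
The helper factors the duplicated matching logic out of both client-list walkers with a tri-state return: a negative value aborts the walk, 1 means keep scanning, 0 means the candidate matches. Both call sites below reduce to the same pattern, schematically:

status = nfs4_match_client(pos, new, &prev, nn);
if (status < 0)
	goto out;	/* waiting for client init failed; abort the walk */
if (status != 0)
	continue;	/* not the same server instance; try the next entry */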
+
/**
* nfs40_walk_client_list - Find server that recognizes a client ID
*
@@ -497,34 +538,10 @@ int nfs40_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
- if (pos->rpc_ops != new->rpc_ops)
- continue;
-
- if (pos->cl_minorversion != new->cl_minorversion)
- continue;
-
- /* If "pos" isn't marked ready, we can't trust the
- * remaining fields in "pos" */
- if (pos->cl_cons_state > NFS_CS_READY) {
- atomic_inc(&pos->cl_count);
- spin_unlock(&nn->nfs_client_lock);
-
- nfs_put_client(prev);
- prev = pos;
-
- status = nfs_wait_client_init_complete(pos);
- if (status < 0)
- goto out;
- status = -NFS4ERR_STALE_CLIENTID;
- spin_lock(&nn->nfs_client_lock);
- }
- if (pos->cl_cons_state != NFS_CS_READY)
- continue;
-
- if (pos->cl_clientid != new->cl_clientid)
- continue;
-
- if (!nfs4_match_client_owner_id(pos, new))
+ status = nfs4_match_client(pos, new, &prev, nn);
+ if (status < 0)
+ goto out_unlock;
+ if (status != 0)
continue;
/*
* We just sent a new SETCLIENTID, which should have
@@ -557,8 +574,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
prev = NULL;
*result = pos;
- dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
- __func__, pos, atomic_read(&pos->cl_count));
goto out;
case -ERESTARTSYS:
case -ETIMEDOUT:
@@ -572,32 +587,17 @@ int nfs40_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
}
+out_unlock:
spin_unlock(&nn->nfs_client_lock);
/* No match found. The server lost our clientid */
out:
nfs_put_client(prev);
- dprintk("NFS: <-- %s status = %d\n", __func__, status);
return status;
}
#ifdef CONFIG_NFS_V4_1
/*
- * Returns true if the client IDs match
- */
-static bool nfs4_match_clientids(u64 a, u64 b)
-{
- if (a != b) {
- dprintk("NFS: --> %s client ID %llx does not match %llx\n",
- __func__, a, b);
- return false;
- }
- dprintk("NFS: --> %s client ID %llx matches %llx\n",
- __func__, a, b);
- return true;
-}
-
-/*
* Returns true if the server major ids match
*/
static bool
@@ -605,36 +605,8 @@ nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1,
struct nfs41_server_owner *o2)
{
if (o1->major_id_sz != o2->major_id_sz)
- goto out_major_mismatch;
- if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
- goto out_major_mismatch;
-
- dprintk("NFS: --> %s server owner major IDs match\n", __func__);
- return true;
-
-out_major_mismatch:
- dprintk("NFS: --> %s server owner major IDs do not match\n",
- __func__);
- return false;
-}
-
-/*
- * Returns true if server minor ids match
- */
-static bool
-nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1,
- struct nfs41_server_owner *o2)
-{
- /* Check eir_server_owner so_minor_id */
- if (o1->minor_id != o2->minor_id)
- goto out_minor_mismatch;
-
- dprintk("NFS: --> %s server owner minor IDs match\n", __func__);
- return true;
-
-out_minor_mismatch:
- dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__);
- return false;
+ return false;
+ return memcmp(o1->major_id, o2->major_id, o1->major_id_sz) == 0;
}
/*
@@ -645,18 +617,9 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1,
struct nfs41_server_scope *s2)
{
if (s1->server_scope_sz != s2->server_scope_sz)
- goto out_scope_mismatch;
- if (memcmp(s1->server_scope, s2->server_scope,
- s1->server_scope_sz) != 0)
- goto out_scope_mismatch;
-
- dprintk("NFS: --> %s server scopes match\n", __func__);
- return true;
-
-out_scope_mismatch:
- dprintk("NFS: --> %s server scopes do not match\n",
- __func__);
- return false;
+ return false;
+ return memcmp(s1->server_scope, s2->server_scope,
+ s1->server_scope_sz) == 0;
}
/**
@@ -680,7 +643,7 @@ int nfs4_detect_session_trunking(struct nfs_client *clp,
struct rpc_xprt *xprt)
{
/* Check eir_clientid */
- if (!nfs4_match_clientids(clp->cl_clientid, res->clientid))
+ if (clp->cl_clientid != res->clientid)
goto out_err;
/* Check eir_server_owner so_major_id */
@@ -689,8 +652,7 @@ int nfs4_detect_session_trunking(struct nfs_client *clp,
goto out_err;
/* Check eir_server_owner so_minor_id */
- if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner,
- res->server_owner))
+ if (clp->cl_serverowner->minor_id != res->server_owner->minor_id)
goto out_err;
/* Check eir_server_scope */
@@ -739,33 +701,10 @@ int nfs41_walk_client_list(struct nfs_client *new,
if (pos == new)
goto found;
- if (pos->rpc_ops != new->rpc_ops)
- continue;
-
- if (pos->cl_minorversion != new->cl_minorversion)
- continue;
-
- /* If "pos" isn't marked ready, we can't trust the
- * remaining fields in "pos", especially the client
- * ID and serverowner fields. Wait for CREATE_SESSION
- * to finish. */
- if (pos->cl_cons_state > NFS_CS_READY) {
- atomic_inc(&pos->cl_count);
- spin_unlock(&nn->nfs_client_lock);
-
- nfs_put_client(prev);
- prev = pos;
-
- status = nfs_wait_client_init_complete(pos);
- spin_lock(&nn->nfs_client_lock);
- if (status < 0)
- break;
- status = -NFS4ERR_STALE_CLIENTID;
- }
- if (pos->cl_cons_state != NFS_CS_READY)
- continue;
-
- if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid))
+ status = nfs4_match_client(pos, new, &prev, nn);
+ if (status < 0)
+ goto out;
+ if (status != 0)
continue;
/*
@@ -777,23 +716,15 @@ int nfs41_walk_client_list(struct nfs_client *new,
new->cl_serverowner))
continue;
- /* Unlike NFSv4.0, we know that NFSv4.1 always uses the
- * uniform string, however someone might switch the
- * uniquifier string on us.
- */
- if (!nfs4_match_client_owner_id(pos, new))
- continue;
found:
atomic_inc(&pos->cl_count);
*result = pos;
status = 0;
- dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
- __func__, pos, atomic_read(&pos->cl_count));
break;
}
+out:
spin_unlock(&nn->nfs_client_lock);
- dprintk("NFS: <-- %s status = %d\n", __func__, status);
nfs_put_client(prev);
return status;
}
@@ -916,9 +847,6 @@ static int nfs4_set_client(struct nfs_server *server,
.timeparms = timeparms,
};
struct nfs_client *clp;
- int error;
-
- dprintk("--> nfs4_set_client()\n");
if (server->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
@@ -927,15 +855,11 @@ static int nfs4_set_client(struct nfs_server *server,
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init);
- if (IS_ERR(clp)) {
- error = PTR_ERR(clp);
- goto error;
- }
+ if (IS_ERR(clp))
+ return PTR_ERR(clp);
- if (server->nfs_client == clp) {
- error = -ELOOP;
- goto error;
- }
+ if (server->nfs_client == clp)
+ return -ELOOP;
/*
* Query for the lease time on clientid setup or renewal
@@ -947,11 +871,7 @@ static int nfs4_set_client(struct nfs_server *server,
set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
server->nfs_client = clp;
- dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
return 0;
-error:
- dprintk("<-- nfs4_set_client() = xerror %d\n", error);
- return error;
}
/*
@@ -982,7 +902,6 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
.net = mds_clp->cl_net,
.timeparms = &ds_timeout,
};
- struct nfs_client *clp;
char buf[INET6_ADDRSTRLEN + 1];
if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
@@ -998,10 +917,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
* (section 13.1 RFC 5661).
*/
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
- clp = nfs_get_client(&cl_init);
-
- dprintk("<-- %s %p\n", __func__, clp);
- return clp;
+ return nfs_get_client(&cl_init);
}
EXPORT_SYMBOL_GPL(nfs4_set_ds_client);
@@ -1098,8 +1014,6 @@ static int nfs4_init_server(struct nfs_server *server,
struct rpc_timeout timeparms;
int error;
- dprintk("--> nfs4_init_server()\n");
-
nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
data->timeo, data->retrans);
@@ -1127,7 +1041,7 @@ static int nfs4_init_server(struct nfs_server *server,
data->minorversion,
data->net);
if (error < 0)
- goto error;
+ return error;
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1138,16 +1052,10 @@ static int nfs4_init_server(struct nfs_server *server,
server->acregmax = data->acregmax * HZ;
server->acdirmin = data->acdirmin * HZ;
server->acdirmax = data->acdirmax * HZ;
+ server->port = data->nfs_server.port;
- server->port = data->nfs_server.port;
-
- error = nfs_init_server_rpcclient(server, &timeparms,
- data->selected_flavor);
-
-error:
- /* Done */
- dprintk("<-- nfs4_init_server() = %d\n", error);
- return error;
+ return nfs_init_server_rpcclient(server, &timeparms,
+ data->selected_flavor);
}
/*
@@ -1163,8 +1071,6 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
bool auth_probe;
int error;
- dprintk("--> nfs4_create_server()\n");
-
server = nfs_alloc_server();
if (!server)
return ERR_PTR(-ENOMEM);
@@ -1180,12 +1086,10 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
if (error < 0)
goto error;
- dprintk("<-- nfs4_create_server() = %p\n", server);
return server;
error:
nfs_free_server(server);
- dprintk("<-- nfs4_create_server() = error %d\n", error);
return ERR_PTR(error);
}
@@ -1200,8 +1104,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
bool auth_probe;
int error;
- dprintk("--> nfs4_create_referral_server()\n");
-
server = nfs_alloc_server();
if (!server)
return ERR_PTR(-ENOMEM);
@@ -1235,12 +1137,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
if (error < 0)
goto error;
- dprintk("<-- nfs_create_referral_server() = %p\n", server);
return server;
error:
nfs_free_server(server);
- dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
return ERR_PTR(error);
}
@@ -1300,31 +1200,16 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
struct sockaddr *localaddr = (struct sockaddr *)&address;
int error;
- dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__,
- (unsigned long long)server->fsid.major,
- (unsigned long long)server->fsid.minor,
- hostname);
-
error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout);
- if (error != 0) {
- dprintk("<-- %s(): rpc_switch_client_transport returned %d\n",
- __func__, error);
- goto out;
- }
+ if (error != 0)
+ return error;
error = rpc_localaddr(clnt, localaddr, sizeof(address));
- if (error != 0) {
- dprintk("<-- %s(): rpc_localaddr returned %d\n",
- __func__, error);
- goto out;
- }
+ if (error != 0)
+ return error;
- error = -EAFNOSUPPORT;
- if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) {
- dprintk("<-- %s(): rpc_ntop returned %d\n",
- __func__, error);
- goto out;
- }
+ if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0)
+ return -EAFNOSUPPORT;
nfs_server_remove_lists(server);
error = nfs4_set_client(server, hostname, sap, salen, buf,
@@ -1333,21 +1218,12 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
nfs_put_client(clp);
if (error != 0) {
nfs_server_insert_lists(server);
- dprintk("<-- %s(): nfs4_set_client returned %d\n",
- __func__, error);
- goto out;
+ return error;
}
if (server->nfs_client->cl_hostname == NULL)
server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
nfs_server_insert_lists(server);
- error = nfs_probe_destination(server);
- if (error < 0)
- goto out;
-
- dprintk("<-- %s() succeeded\n", __func__);
-
-out:
- return error;
+ return nfs_probe_destination(server);
}
diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c
index 039b3eb6d834..ac8406018962 100644
--- a/fs/nfs/nfs4getroot.c
+++ b/fs/nfs/nfs4getroot.c
@@ -14,8 +14,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p
struct nfs_fsinfo fsinfo;
int ret = -ENOMEM;
- dprintk("--> nfs4_get_rootfh()\n");
-
fsinfo.fattr = nfs_alloc_fattr();
if (fsinfo.fattr == NULL)
goto out;
@@ -38,6 +36,5 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p
memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
out:
nfs_free_fattr(fsinfo.fattr);
- dprintk("<-- nfs4_get_rootfh() = %d\n", ret);
return ret;
}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index d8b040bd9814..7d531da1bae3 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -340,7 +340,6 @@ static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
out:
free_page((unsigned long) page);
free_page((unsigned long) page2);
- dprintk("%s: done\n", __func__);
return mnt;
}
@@ -358,11 +357,9 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *
int err;
/* BUG_ON(IS_ROOT(dentry)); */
- dprintk("%s: enter\n", __func__);
-
page = alloc_page(GFP_KERNEL);
if (page == NULL)
- goto out;
+ return mnt;
fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
if (fs_locations == NULL)
@@ -386,8 +383,6 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *
out_free:
__free_page(page);
kfree(fs_locations);
-out:
- dprintk("%s: done\n", __func__);
return mnt;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 201ca3f2c4ba..98b0b662af09 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -698,7 +698,8 @@ static int nfs41_sequence_process(struct rpc_task *task,
session = slot->table->session;
if (slot->interrupted) {
- slot->interrupted = 0;
+ if (res->sr_status != -NFS4ERR_DELAY)
+ slot->interrupted = 0;
interrupted = true;
}
@@ -2300,8 +2301,10 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
if (status != 0)
return status;
}
- if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
+ if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) {
+ nfs4_sequence_free_slot(&o_res->seq_res);
nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
+ }
return 0;
}
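
Freeing the sequence slot before the fallback GETATTR matters: the attribute fetch is a second, synchronous RPC, and issuing it while the OPEN's session slot is still held would tie up a slot-table entry for the whole extra round trip (presumably the motivation here). Schematically:

if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) {
	nfs4_sequence_free_slot(&o_res->seq_res);	/* release the slot first */
	nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
}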
@@ -2586,7 +2589,8 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
/* Except for MODE, setting an attribute twice seems harmless. */
if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE &&
- attrset[1] & FATTR4_WORD1_MODE)
+ (attrset[1] & FATTR4_WORD1_MODE ||
+ attrset[2] & FATTR4_WORD2_MODE_UMASK))
sattr->ia_valid &= ~ATTR_MODE;
if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
@@ -3265,6 +3269,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
.rpc_resp = &res,
};
int status;
+ int i;
bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS |
FATTR4_WORD0_FH_EXPIRE_TYPE |
@@ -3330,8 +3335,13 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
server->cache_consistency_bitmask[2] = 0;
+
+ /* Avoid a regression due to buggy server */
+ for (i = 0; i < ARRAY_SIZE(res.exclcreat_bitmask); i++)
+ res.exclcreat_bitmask[i] &= res.attr_bitmask[i];
memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask,
sizeof(server->exclcreat_bitmask));
+
server->acl_bitmask = res.acl_bitmask;
server->fh_expire_type = res.fh_expire_type;
}
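
Masking exclcreat_bitmask with attr_bitmask defends against servers that advertise attributes for exclusive create which they do not actually list as supported; without the mask the client could encode attributes the server then rejects at OPEN time (the likely regression the comment refers to). The sanitizing step is a plain element-wise AND:

for (i = 0; i < ARRAY_SIZE(res.exclcreat_bitmask); i++)
	res.exclcreat_bitmask[i] &= res.attr_bitmask[i];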
@@ -4610,7 +4620,7 @@ static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
return 0;
if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
hdr->args.lock_context,
- hdr->rw_ops->rw_mode) == -EIO)
+ hdr->rw_mode) == -EIO)
return -EIO;
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
return -EIO;
@@ -4804,8 +4814,10 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred,
if (!atomic_inc_not_zero(&clp->cl_count))
return -EIO;
data = kmalloc(sizeof(*data), GFP_NOFS);
- if (data == NULL)
+ if (data == NULL) {
+ nfs_put_client(clp);
return -ENOMEM;
+ }
data->client = clp;
data->timestamp = jiffies;
return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT,
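
The kmalloc-failure branch above fixes a client reference leak: the reference taken with atomic_inc_not_zero() must be dropped on every exit path. The invariant, schematically:

if (!atomic_inc_not_zero(&clp->cl_count))
	return -EIO;			/* no reference was taken */
data = kmalloc(sizeof(*data), GFP_NOFS);
if (data == NULL) {
	nfs_put_client(clp);		/* pair the reference before bailing out */
	return -ENOMEM;
}

The same pairing is applied to _nfs4_proc_exchange_id() later in this patch.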
@@ -5782,6 +5794,7 @@ struct nfs4_unlockdata {
struct nfs_locku_res res;
struct nfs4_lock_state *lsp;
struct nfs_open_context *ctx;
+ struct nfs_lock_context *l_ctx;
struct file_lock fl;
struct nfs_server *server;
unsigned long timestamp;
@@ -5806,6 +5819,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
atomic_inc(&lsp->ls_count);
/* Ensure we don't close file until we're done freeing locks! */
p->ctx = get_nfs_open_context(ctx);
+ p->l_ctx = nfs_get_lock_context(ctx);
memcpy(&p->fl, fl, sizeof(p->fl));
p->server = NFS_SERVER(inode);
return p;
@@ -5816,6 +5830,7 @@ static void nfs4_locku_release_calldata(void *data)
struct nfs4_unlockdata *calldata = data;
nfs_free_seqid(calldata->arg.seqid);
nfs4_put_lock_state(calldata->lsp);
+ nfs_put_lock_context(calldata->l_ctx);
put_nfs_open_context(calldata->ctx);
kfree(calldata);
}
@@ -5857,6 +5872,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
{
struct nfs4_unlockdata *calldata = data;
+ if (test_bit(NFS_CONTEXT_UNLOCK, &calldata->l_ctx->open_context->flags) &&
+ nfs_async_iocounter_wait(task, calldata->l_ctx))
+ return;
+
if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
goto out_wait;
nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
@@ -5908,6 +5927,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
* canceled lock is passed in, and it won't be an unlock.
*/
fl->fl_type = F_UNLCK;
+ if (fl->fl_flags & FL_CLOSE)
+ set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags);
data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid);
if (data == NULL) {
@@ -6352,7 +6373,7 @@ struct nfs4_lock_waiter {
};
static int
-nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key)
+nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key)
{
int ret;
struct cb_notify_lock_args *cbnl = key;
@@ -6395,7 +6416,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
.inode = state->inode,
.owner = &owner,
.notified = false };
- wait_queue_t wait;
+ wait_queue_entry_t wait;
/* Don't bother with waitqueue if we don't expect a callback */
if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
@@ -6445,9 +6466,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
ctx = nfs_file_open_context(filp);
state = ctx->state;
- if (request->fl_start < 0 || request->fl_end < 0)
- return -EINVAL;
-
if (IS_GETLK(cmd)) {
if (state != NULL)
return nfs4_proc_getlk(state, F_GETLK, request);
@@ -6470,20 +6488,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
!test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
return -ENOLCK;
- /*
- * Don't rely on the VFS having checked the file open mode,
- * since it won't do this for flock() locks.
- */
- switch (request->fl_type) {
- case F_RDLCK:
- if (!(filp->f_mode & FMODE_READ))
- return -EBADF;
- break;
- case F_WRLCK:
- if (!(filp->f_mode & FMODE_WRITE))
- return -EBADF;
- }
-
status = nfs4_set_lock_state(state, request);
if (status != 0)
return status;
@@ -7155,8 +7159,6 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
};
struct rpc_task *task;
- dprintk("--> %s\n", __func__);
-
nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id);
if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
args.dir = NFS4_CDFC4_FORE;
@@ -7176,24 +7178,20 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
if (memcmp(res.sessionid.data,
clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
dprintk("NFS: %s: Session ID mismatch\n", __func__);
- status = -EIO;
- goto out;
+ return -EIO;
}
if ((res.dir & args.dir) != res.dir || res.dir == 0) {
dprintk("NFS: %s: Unexpected direction from server\n",
__func__);
- status = -EIO;
- goto out;
+ return -EIO;
}
if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) {
dprintk("NFS: %s: Server returned RDMA mode = true\n",
__func__);
- status = -EIO;
- goto out;
+ return -EIO;
}
}
-out:
- dprintk("<-- %s status= %d\n", __func__, status);
+
return status;
}
@@ -7459,15 +7457,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
};
struct nfs41_exchange_id_data *calldata;
struct rpc_task *task;
- int status = -EIO;
+ int status;
if (!atomic_inc_not_zero(&clp->cl_count))
- goto out;
+ return -EIO;
- status = -ENOMEM;
calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
- if (!calldata)
- goto out;
+ if (!calldata) {
+ nfs_put_client(clp);
+ return -ENOMEM;
+ }
if (!xprt)
nfs4_init_boot_verifier(clp, &verifier);
@@ -7476,10 +7475,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
if (status)
goto out_calldata;
- dprintk("NFS call exchange_id auth=%s, '%s'\n",
- clp->cl_rpcclient->cl_auth->au_ops->au_name,
- clp->cl_owner_id);
-
calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
GFP_NOFS);
status = -ENOMEM;
@@ -7545,13 +7540,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
rpc_put_task(task);
out:
- if (clp->cl_implid != NULL)
- dprintk("NFS reply exchange_id: Server Implementation ID: "
- "domain: %s, name: %s, date: %llu,%u\n",
- clp->cl_implid->domain, clp->cl_implid->name,
- clp->cl_implid->date.seconds,
- clp->cl_implid->date.nseconds);
- dprintk("NFS reply exchange_id: %d\n", status);
return status;
out_impl_id:
@@ -7769,17 +7757,13 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);
nfs4_set_sequence_privileged(&args.la_seq_args);
- dprintk("--> %s\n", __func__);
task = rpc_run_task(&task_setup);
if (IS_ERR(task))
- status = PTR_ERR(task);
- else {
- status = task->tk_status;
- rpc_put_task(task);
- }
- dprintk("<-- %s return %d\n", __func__, status);
+ return PTR_ERR(task);
+ status = task->tk_status;
+ rpc_put_task(task);
return status;
}
@@ -8180,6 +8164,12 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
/* fall through */
case -NFS4ERR_RETRY_UNCACHED_REP:
return -EAGAIN;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ nfs4_schedule_session_recovery(clp->cl_session,
+ task->tk_status);
+ break;
default:
nfs4_schedule_lease_recovery(clp);
}
@@ -8258,7 +8248,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
if (status == 0)
status = task->tk_status;
rpc_put_task(task);
- return 0;
out:
dprintk("<-- %s status=%d\n", __func__, status);
return status;
@@ -8357,6 +8346,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
*/
pnfs_mark_layout_stateid_invalid(lo, &head);
spin_unlock(&inode->i_lock);
+ nfs_commit_inode(inode, 0);
pnfs_free_lseg_list(&head);
status = -EAGAIN;
goto out;
@@ -8427,6 +8417,7 @@ static void nfs4_layoutget_release(void *calldata)
size_t max_pages = max_response_pages(server);
dprintk("--> %s\n", __func__);
+ nfs4_sequence_free_slot(&lgp->res.seq_res);
nfs4_free_pages(lgp->args.layout.pages, max_pages);
pnfs_put_layout_hdr(NFS_I(inode)->layout);
put_nfs_open_context(lgp->args.ctx);
@@ -8501,7 +8492,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
if (status == 0 && lgp->res.layoutp->len)
lseg = pnfs_layout_process(lgp);
- nfs4_sequence_free_slot(&lgp->res.seq_res);
rpc_put_task(task);
dprintk("<-- %s status=%d\n", __func__, status);
if (status)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 8156bad6b441..cbf82b0d4467 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1649,13 +1649,14 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
}
-static void nfs4_reclaim_complete(struct nfs_client *clp,
+static int nfs4_reclaim_complete(struct nfs_client *clp,
const struct nfs4_state_recovery_ops *ops,
struct rpc_cred *cred)
{
/* Notify the server we're done reclaiming our state */
if (ops->reclaim_complete)
- (void)ops->reclaim_complete(clp, cred);
+ return ops->reclaim_complete(clp, cred);
+ return 0;
}
static void nfs4_clear_reclaim_server(struct nfs_server *server)
@@ -1702,13 +1703,16 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
{
const struct nfs4_state_recovery_ops *ops;
struct rpc_cred *cred;
+ int err;
if (!nfs4_state_clear_reclaim_reboot(clp))
return;
ops = clp->cl_mvops->reboot_recovery_ops;
cred = nfs4_get_clid_cred(clp);
- nfs4_reclaim_complete(clp, ops, cred);
+ err = nfs4_reclaim_complete(clp, ops, cred);
put_rpccred(cred);
+ if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
}
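
Propagating the RECLAIM_COMPLETE status lets the state manager re-arm reboot recovery when the session binding was lost mid-reclaim, presumably so RECLAIM_COMPLETE is retried once the connection is bound to the session again. The re-arm itself is a single bit:

if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
	set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);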
static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
@@ -2130,6 +2134,8 @@ again:
put_rpccred(cred);
switch (status) {
case 0:
+ case -EINTR:
+ case -ERESTARTSYS:
break;
case -ETIMEDOUT:
if (clnt->cl_softrtry)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 80ce289eea05..3aebfdc82b30 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1000,8 +1000,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
const struct nfs4_label *label,
+ const umode_t *umask,
const struct nfs_server *server,
- bool excl_check, const umode_t *umask)
+ const uint32_t attrmask[])
{
char owner_name[IDMAP_NAMESZ];
char owner_group[IDMAP_NAMESZ];
@@ -1016,22 +1017,20 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
/*
* We reserve enough space to write the entire attribute buffer at once.
*/
- if (iap->ia_valid & ATTR_SIZE) {
+ if ((iap->ia_valid & ATTR_SIZE) && (attrmask[0] & FATTR4_WORD0_SIZE)) {
bmval[0] |= FATTR4_WORD0_SIZE;
len += 8;
}
- if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
- umask = NULL;
if (iap->ia_valid & ATTR_MODE) {
- if (umask) {
+ if (umask && (attrmask[2] & FATTR4_WORD2_MODE_UMASK)) {
bmval[2] |= FATTR4_WORD2_MODE_UMASK;
len += 8;
- } else {
+ } else if (attrmask[1] & FATTR4_WORD1_MODE) {
bmval[1] |= FATTR4_WORD1_MODE;
len += 4;
}
}
- if (iap->ia_valid & ATTR_UID) {
+ if ((iap->ia_valid & ATTR_UID) && (attrmask[1] & FATTR4_WORD1_OWNER)) {
owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
if (owner_namelen < 0) {
dprintk("nfs: couldn't resolve uid %d to string\n",
@@ -1044,7 +1043,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
bmval[1] |= FATTR4_WORD1_OWNER;
len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
}
- if (iap->ia_valid & ATTR_GID) {
+ if ((iap->ia_valid & ATTR_GID) &&
+ (attrmask[1] & FATTR4_WORD1_OWNER_GROUP)) {
owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
if (owner_grouplen < 0) {
dprintk("nfs: couldn't resolve gid %d to string\n",
@@ -1056,32 +1056,26 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
bmval[1] |= FATTR4_WORD1_OWNER_GROUP;
len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);
}
- if (iap->ia_valid & ATTR_ATIME_SET) {
- bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
- len += 16;
- } else if (iap->ia_valid & ATTR_ATIME) {
- bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
- len += 4;
- }
- if (iap->ia_valid & ATTR_MTIME_SET) {
- bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
- len += 16;
- } else if (iap->ia_valid & ATTR_MTIME) {
- bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
- len += 4;
+ if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
+ if (iap->ia_valid & ATTR_ATIME_SET) {
+ bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
+ len += 16;
+ } else if (iap->ia_valid & ATTR_ATIME) {
+ bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
+ len += 4;
+ }
}
-
- if (excl_check) {
- const u32 *excl_bmval = server->exclcreat_bitmask;
- bmval[0] &= excl_bmval[0];
- bmval[1] &= excl_bmval[1];
- bmval[2] &= excl_bmval[2];
-
- if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL))
- label = NULL;
+ if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
+ if (iap->ia_valid & ATTR_MTIME_SET) {
+ bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
+ len += 16;
+ } else if (iap->ia_valid & ATTR_MTIME) {
+ bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
+ len += 4;
+ }
}
- if (label) {
+ if (label && (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL)) {
len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
}
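
With the excl_check flag gone, each caller now names the governing bitmask explicitly, as the hunks below show; the two interesting cases side by side:

/* ordinary create/open: gate on everything the server supports */
encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
	     arg->server, arg->server->attr_bitmask);
/* EXCLUSIVE4_1: gate on the (now sanitized) exclcreat bitmask */
encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
	     arg->server, arg->server->exclcreat_bitmask);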
@@ -1188,8 +1182,8 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
}
encode_string(xdr, create->name->len, create->name->name);
- encode_attrs(xdr, create->attrs, create->label, create->server, false,
- &create->umask);
+ encode_attrs(xdr, create->attrs, create->label, &create->umask,
+ create->server, create->server->attr_bitmask);
}
static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1409,13 +1403,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
switch(arg->createmode) {
case NFS4_CREATE_UNCHECKED:
*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false,
- &arg->umask);
+ encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
+ arg->server, arg->server->attr_bitmask);
break;
case NFS4_CREATE_GUARDED:
*p = cpu_to_be32(NFS4_CREATE_GUARDED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false,
- &arg->umask);
+ encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
+ arg->server, arg->server->attr_bitmask);
break;
case NFS4_CREATE_EXCLUSIVE:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1424,8 +1418,8 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
case NFS4_CREATE_EXCLUSIVE4_1:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
encode_nfs4_verifier(xdr, &arg->u.verifier);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true,
- &arg->umask);
+ encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
+ arg->server, arg->server->exclcreat_bitmask);
}
}
@@ -1681,7 +1675,8 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
{
encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
encode_nfs4_stateid(xdr, &arg->stateid);
- encode_attrs(xdr, arg->iap, arg->label, server, false, NULL);
+ encode_attrs(xdr, arg->iap, arg->label, NULL, server,
+ server->attr_bitmask);
}
static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -2005,16 +2000,10 @@ encode_layoutcommit(struct xdr_stream *xdr,
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
- if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
- NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
- NFS_I(inode)->layout, xdr, args);
- } else {
- encode_uint32(xdr, args->layoutupdate_len);
- if (args->layoutupdate_pages) {
- xdr_write_pages(xdr, args->layoutupdate_pages, 0,
- args->layoutupdate_len);
- }
- }
+ encode_uint32(xdr, args->layoutupdate_len);
+ if (args->layoutupdate_pages)
+ xdr_write_pages(xdr, args->layoutupdate_pages, 0,
+ args->layoutupdate_len);
return 0;
}
@@ -2024,7 +2013,6 @@ encode_layoutreturn(struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args,
struct compound_hdr *hdr)
{
- const struct pnfs_layoutdriver_type *lr_ops = NFS_SERVER(args->inode)->pnfs_curr_ld;
__be32 *p;
encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr);
@@ -2041,8 +2029,6 @@ encode_layoutreturn(struct xdr_stream *xdr,
spin_unlock(&args->inode->i_lock);
if (args->ld_private->ops && args->ld_private->ops->encode)
args->ld_private->ops->encode(xdr, args, args->ld_private);
- else if (lr_ops->encode_layoutreturn)
- lr_ops->encode_layoutreturn(xdr, args);
else
encode_uint32(xdr, 0);
}
@@ -5579,6 +5565,8 @@ static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
unsigned int i;
p = xdr_inline_decode(xdr, 4);
+ if (!p)
+ return -EIO;
bitmap_words = be32_to_cpup(p++);
if (bitmap_words > NFS4_OP_MAP_NUM_WORDS)
return -EIO;
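
The added NULL check follows the standard defensive pattern for inline XDR decodes: xdr_inline_decode() returns NULL whenever the receive buffer cannot supply the requested bytes, so every call must be checked before the returned pointer is dereferenced. The canonical shape:

p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
	return -EIO;		/* short or malformed reply */
val = be32_to_cpup(p);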
diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
deleted file mode 100644
index ed30ea072bb8..000000000000
--- a/fs/nfs/objlayout/Kbuild
+++ /dev/null
@@ -1,5 +0,0 @@
-#
-# Makefile for the pNFS Objects Layout Driver kernel module
-#
-objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
-obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
deleted file mode 100644
index 049c1b1f2932..000000000000
--- a/fs/nfs/objlayout/objio_osd.c
+++ /dev/null
@@ -1,675 +0,0 @@
-/*
- * pNFS Objects layout implementation over open-osd initiator library
- *
- * Copyright (C) 2009 Panasas Inc. [year of first publication]
- * All rights reserved.
- *
- * Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * See the file COPYING included with this distribution for more details.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the Panasas company nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/module.h>
-#include <scsi/osd_ore.h>
-
-#include "objlayout.h"
-#include "../internal.h"
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-struct objio_dev_ent {
- struct nfs4_deviceid_node id_node;
- struct ore_dev od;
-};
-
-static void
-objio_free_deviceid_node(struct nfs4_deviceid_node *d)
-{
- struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
-
- dprintk("%s: free od=%p\n", __func__, de->od.od);
- osduld_put_device(de->od.od);
- kfree_rcu(d, rcu);
-}
-
-struct objio_segment {
- struct pnfs_layout_segment lseg;
-
- struct ore_layout layout;
- struct ore_components oc;
-};
-
-static inline struct objio_segment *
-OBJIO_LSEG(struct pnfs_layout_segment *lseg)
-{
- return container_of(lseg, struct objio_segment, lseg);
-}
-
-struct objio_state {
- /* Generic layer */
- struct objlayout_io_res oir;
-
- bool sync;
- /*FIXME: Support for extra_bytes at ore_get_rw_state() */
- struct ore_io_state *ios;
-};
-
-/* Send and wait for a get_device_info of devices in the layout,
- then look them up with the osd_initiator library */
-struct nfs4_deviceid_node *
-objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
- gfp_t gfp_flags)
-{
- struct pnfs_osd_deviceaddr *deviceaddr;
- struct objio_dev_ent *ode = NULL;
- struct osd_dev *od;
- struct osd_dev_info odi;
- bool retry_flag = true;
- __be32 *p;
- int err;
-
- deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
- if (!deviceaddr)
- return NULL;
-
- p = page_address(pdev->pages[0]);
- pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
-
- odi.systemid_len = deviceaddr->oda_systemid.len;
- if (odi.systemid_len > sizeof(odi.systemid)) {
- dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
- __func__, sizeof(odi.systemid));
- err = -EINVAL;
- goto out;
- } else if (odi.systemid_len)
- memcpy(odi.systemid, deviceaddr->oda_systemid.data,
- odi.systemid_len);
- odi.osdname_len = deviceaddr->oda_osdname.len;
- odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
-
- if (!odi.osdname_len && !odi.systemid_len) {
- dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
- __func__);
- err = -ENODEV;
- goto out;
- }
-
-retry_lookup:
- od = osduld_info_lookup(&odi);
- if (IS_ERR(od)) {
- err = PTR_ERR(od);
- dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
- if (err == -ENODEV && retry_flag) {
- err = objlayout_autologin(deviceaddr);
- if (likely(!err)) {
- retry_flag = false;
- goto retry_lookup;
- }
- }
- goto out;
- }
-
- dprintk("Adding new dev_id(%llx:%llx)\n",
- _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
-
- ode = kzalloc(sizeof(*ode), gfp_flags);
- if (!ode) {
- dprintk("%s: -ENOMEM od=%p\n", __func__, od);
- goto out;
- }
-
- nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
- kfree(deviceaddr);
-
- ode->od.od = od;
- return &ode->id_node;
-
-out:
- kfree(deviceaddr);
- return NULL;
-}
-
-static void copy_single_comp(struct ore_components *oc, unsigned c,
- struct pnfs_osd_object_cred *src_comp)
-{
- struct ore_comp *ocomp = &oc->comps[c];
-
- WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
- WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
-
- ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
- ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
-
- memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
-}
-
-static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
- struct objio_segment **pseg)
-{
-/* This is the in memory structure of the objio_segment
- *
- * struct __alloc_objio_segment {
- * struct objio_segment olseg;
- * struct ore_dev *ods[numdevs];
- * struct ore_comp comps[numdevs];
- * } *aolseg;
- * NOTE: The code as above compiles and runs perfectly. It is elegant,
- * type safe and compact. At some Past time Linus has decided he does not
- * like variable length arrays, For the sake of this principal we uglify
- * the code as below.
- */
- struct objio_segment *lseg;
- size_t lseg_size = sizeof(*lseg) +
- numdevs * sizeof(lseg->oc.ods[0]) +
- numdevs * sizeof(*lseg->oc.comps);
-
- lseg = kzalloc(lseg_size, gfp_flags);
- if (unlikely(!lseg)) {
- dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__,
- numdevs, lseg_size);
- return -ENOMEM;
- }
-
- lseg->oc.numdevs = numdevs;
- lseg->oc.single_comp = EC_MULTPLE_COMPS;
- lseg->oc.ods = (void *)(lseg + 1);
- lseg->oc.comps = (void *)(lseg->oc.ods + numdevs);
-
- *pseg = lseg;
- return 0;
-}
-
-int objio_alloc_lseg(struct pnfs_layout_segment **outp,
- struct pnfs_layout_hdr *pnfslay,
- struct pnfs_layout_range *range,
- struct xdr_stream *xdr,
- gfp_t gfp_flags)
-{
- struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
- struct objio_segment *objio_seg;
- struct pnfs_osd_xdr_decode_layout_iter iter;
- struct pnfs_osd_layout layout;
- struct pnfs_osd_object_cred src_comp;
- unsigned cur_comp;
- int err;
-
- err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
- if (unlikely(err))
- return err;
-
- err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
- if (unlikely(err))
- return err;
-
- objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
- objio_seg->layout.group_width = layout.olo_map.odm_group_width;
- objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
- objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
- objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
-
- err = ore_verify_layout(layout.olo_map.odm_num_comps,
- &objio_seg->layout);
- if (unlikely(err))
- goto err;
-
- objio_seg->oc.first_dev = layout.olo_comps_index;
- cur_comp = 0;
- while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
- struct nfs4_deviceid_node *d;
- struct objio_dev_ent *ode;
-
- copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
-
- d = nfs4_find_get_deviceid(server,
- &src_comp.oc_object_id.oid_device_id,
- pnfslay->plh_lc_cred, gfp_flags);
- if (!d) {
- err = -ENXIO;
- goto err;
- }
-
- ode = container_of(d, struct objio_dev_ent, id_node);
- objio_seg->oc.ods[cur_comp++] = &ode->od;
- }
- /* pnfs_osd_xdr_decode_layout_comp returns false on error */
- if (unlikely(err))
- goto err;
-
- *outp = &objio_seg->lseg;
- return 0;
-
-err:
- kfree(objio_seg);
- dprintk("%s: Error: return %d\n", __func__, err);
- *outp = NULL;
- return err;
-}
-
-void objio_free_lseg(struct pnfs_layout_segment *lseg)
-{
- int i;
- struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
-
- for (i = 0; i < objio_seg->oc.numdevs; i++) {
- struct ore_dev *od = objio_seg->oc.ods[i];
- struct objio_dev_ent *ode;
-
- if (!od)
- break;
- ode = container_of(od, typeof(*ode), od);
- nfs4_put_deviceid_node(&ode->id_node);
- }
- kfree(objio_seg);
-}
-
-static int
-objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
- struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
- loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
- struct objio_state **outp)
-{
- struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
- struct ore_io_state *ios;
- int ret;
- struct __alloc_objio_state {
- struct objio_state objios;
- struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
- } *aos;
-
- aos = kzalloc(sizeof(*aos), gfp_flags);
- if (unlikely(!aos))
- return -ENOMEM;
-
- objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
- aos->ioerrs, rpcdata, pnfs_layout_type);
-
- ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
- offset, count, &ios);
- if (unlikely(ret)) {
- kfree(aos);
- return ret;
- }
-
- ios->pages = pages;
- ios->pgbase = pgbase;
- ios->private = aos;
- BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
-
- aos->objios.sync = 0;
- aos->objios.ios = ios;
- *outp = &aos->objios;
- return 0;
-}
-
-void objio_free_result(struct objlayout_io_res *oir)
-{
- struct objio_state *objios = container_of(oir, struct objio_state, oir);
-
- ore_put_io_state(objios->ios);
- kfree(objios);
-}
-
-static enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
-{
- switch (oep) {
- case OSD_ERR_PRI_NO_ERROR:
- return (enum pnfs_osd_errno)0;
-
- case OSD_ERR_PRI_CLEAR_PAGES:
- BUG_ON(1);
- return 0;
-
- case OSD_ERR_PRI_RESOURCE:
- return PNFS_OSD_ERR_RESOURCE;
- case OSD_ERR_PRI_BAD_CRED:
- return PNFS_OSD_ERR_BAD_CRED;
- case OSD_ERR_PRI_NO_ACCESS:
- return PNFS_OSD_ERR_NO_ACCESS;
- case OSD_ERR_PRI_UNREACHABLE:
- return PNFS_OSD_ERR_UNREACHABLE;
- case OSD_ERR_PRI_NOT_FOUND:
- return PNFS_OSD_ERR_NOT_FOUND;
- case OSD_ERR_PRI_NO_SPACE:
- return PNFS_OSD_ERR_NO_SPACE;
- default:
- WARN_ON(1);
- /* fallthrough */
- case OSD_ERR_PRI_EIO:
- return PNFS_OSD_ERR_EIO;
- }
-}
-
-static void __on_dev_error(struct ore_io_state *ios,
- struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
- u64 dev_offset, u64 dev_len)
-{
- struct objio_state *objios = ios->private;
- struct pnfs_osd_objid pooid;
- struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
- /* FIXME: what to do with more-then-one-group layouts. We need to
- * translate from ore_io_state index to oc->comps index
- */
- unsigned comp = dev_index;
-
- pooid.oid_device_id = ode->id_node.deviceid;
- pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
- pooid.oid_object_id = ios->oc->comps[comp].obj.id;
-
- objlayout_io_set_result(&objios->oir, comp,
- &pooid, osd_pri_2_pnfs_err(oep),
- dev_offset, dev_len, !ios->reading);
-}
-
-/*
- * read
- */
-static void _read_done(struct ore_io_state *ios, void *private)
-{
- struct objio_state *objios = private;
- ssize_t status;
- int ret = ore_check_io(ios, &__on_dev_error);
-
- /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
-
- if (likely(!ret))
- status = ios->length;
- else
- status = ret;
-
- objlayout_read_done(&objios->oir, status, objios->sync);
-}
-
-int objio_read_pagelist(struct nfs_pgio_header *hdr)
-{
- struct objio_state *objios;
- int ret;
-
- ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
- hdr->lseg, hdr->args.pages, hdr->args.pgbase,
- hdr->args.offset, hdr->args.count, hdr,
- GFP_KERNEL, &objios);
- if (unlikely(ret))
- return ret;
-
- objios->ios->done = _read_done;
- dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
- hdr->args.offset, hdr->args.count);
- ret = ore_read(objios->ios);
- if (unlikely(ret))
- objio_free_result(&objios->oir);
- return ret;
-}
-
-/*
- * write
- */
-static void _write_done(struct ore_io_state *ios, void *private)
-{
- struct objio_state *objios = private;
- ssize_t status;
- int ret = ore_check_io(ios, &__on_dev_error);
-
- /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
-
- if (likely(!ret)) {
- /* FIXME: should be based on the OSD's persistence model
- * See OSD2r05 Section 4.13 Data persistence model */
- objios->oir.committed = NFS_FILE_SYNC;
- status = ios->length;
- } else {
- status = ret;
- }
-
- objlayout_write_done(&objios->oir, status, objios->sync);
-}
-
-static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
-{
- struct objio_state *objios = priv;
- struct nfs_pgio_header *hdr = objios->oir.rpcdata;
- struct address_space *mapping = hdr->inode->i_mapping;
- pgoff_t index = offset / PAGE_SIZE;
- struct page *page;
- loff_t i_size = i_size_read(hdr->inode);
-
- if (offset >= i_size) {
- *uptodate = true;
- dprintk("%s: g_zero_page index=0x%lx\n", __func__, index);
- return ZERO_PAGE(0);
- }
-
- page = find_get_page(mapping, index);
- if (!page) {
- page = find_or_create_page(mapping, index, GFP_NOFS);
- if (unlikely(!page)) {
- dprintk("%s: grab_cache_page Failed index=0x%lx\n",
- __func__, index);
- return NULL;
- }
- unlock_page(page);
- }
- *uptodate = PageUptodate(page);
- dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
- return page;
-}
-
-static void __r4w_put_page(void *priv, struct page *page)
-{
- dprintk("%s: index=0x%lx\n", __func__,
- (page == ZERO_PAGE(0)) ? -1UL : page->index);
- if (ZERO_PAGE(0) != page)
- put_page(page);
- return;
-}
-
-static const struct _ore_r4w_op _r4w_op = {
- .get_page = &__r4w_get_page,
- .put_page = &__r4w_put_page,
-};
-
-int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
-{
- struct objio_state *objios;
- int ret;
-
- ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
- hdr->lseg, hdr->args.pages, hdr->args.pgbase,
- hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
- &objios);
- if (unlikely(ret))
- return ret;
-
- objios->sync = 0 != (how & FLUSH_SYNC);
- objios->ios->r4w = &_r4w_op;
-
- if (!objios->sync)
- objios->ios->done = _write_done;
-
- dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
- hdr->args.offset, hdr->args.count);
- ret = ore_write(objios->ios);
- if (unlikely(ret)) {
- objio_free_result(&objios->oir);
- return ret;
- }
-
- if (objios->sync)
- _write_done(objios->ios, objios);
-
- return 0;
-}
-
-/*
- * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
- * of bytes (maximum @req->wb_bytes) that can be coalesced.
- */
-static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
- struct nfs_page *prev, struct nfs_page *req)
-{
- struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio);
- unsigned int size;
-
- size = pnfs_generic_pg_test(pgio, prev, req);
-
- if (!size || mirror->pg_count + req->wb_bytes >
- (unsigned long)pgio->pg_layout_private)
- return 0;
-
- return min(size, req->wb_bytes);
-}
-
-static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
-{
- pnfs_generic_pg_init_read(pgio, req);
- if (unlikely(pgio->pg_lseg == NULL))
- return; /* Not pNFS */
-
- pgio->pg_layout_private = (void *)
- OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
-}
-
-static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout,
- unsigned long *stripe_end)
-{
- u32 stripe_off;
- unsigned stripe_size;
-
- if (layout->raid_algorithm == PNFS_OSD_RAID_0)
- return true;
-
- stripe_size = layout->stripe_unit *
- (layout->group_width - layout->parity);
-
- div_u64_rem(offset, stripe_size, &stripe_off);
- if (!stripe_off)
- return true;
-
- *stripe_end = stripe_size - stripe_off;
- return false;
-}
-
-static void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
-{
- unsigned long stripe_end = 0;
- u64 wb_size;
-
- if (pgio->pg_dreq == NULL)
- wb_size = i_size_read(pgio->pg_inode) - req_offset(req);
- else
- wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
-
- pnfs_generic_pg_init_write(pgio, req, wb_size);
- if (unlikely(pgio->pg_lseg == NULL))
- return; /* Not pNFS */
-
- if (req->wb_offset ||
- !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE,
- &OBJIO_LSEG(pgio->pg_lseg)->layout,
- &stripe_end)) {
- pgio->pg_layout_private = (void *)stripe_end;
- } else {
- pgio->pg_layout_private = (void *)
- OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
- }
-}
-
-static const struct nfs_pageio_ops objio_pg_read_ops = {
- .pg_init = objio_init_read,
- .pg_test = objio_pg_test,
- .pg_doio = pnfs_generic_pg_readpages,
- .pg_cleanup = pnfs_generic_pg_cleanup,
-};
-
-static const struct nfs_pageio_ops objio_pg_write_ops = {
- .pg_init = objio_init_write,
- .pg_test = objio_pg_test,
- .pg_doio = pnfs_generic_pg_writepages,
- .pg_cleanup = pnfs_generic_pg_cleanup,
-};
-
-static struct pnfs_layoutdriver_type objlayout_type = {
- .id = LAYOUT_OSD2_OBJECTS,
- .name = "LAYOUT_OSD2_OBJECTS",
- .flags = PNFS_LAYOUTRET_ON_SETATTR |
- PNFS_LAYOUTRET_ON_ERROR,
-
- .max_deviceinfo_size = PAGE_SIZE,
- .owner = THIS_MODULE,
- .alloc_layout_hdr = objlayout_alloc_layout_hdr,
- .free_layout_hdr = objlayout_free_layout_hdr,
-
- .alloc_lseg = objlayout_alloc_lseg,
- .free_lseg = objlayout_free_lseg,
-
- .read_pagelist = objlayout_read_pagelist,
- .write_pagelist = objlayout_write_pagelist,
- .pg_read_ops = &objio_pg_read_ops,
- .pg_write_ops = &objio_pg_write_ops,
-
- .sync = pnfs_generic_sync,
-
- .free_deviceid_node = objio_free_deviceid_node,
-
- .encode_layoutcommit = objlayout_encode_layoutcommit,
- .encode_layoutreturn = objlayout_encode_layoutreturn,
-};
-
-MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
-MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
-MODULE_LICENSE("GPL");
-
-static int __init
-objlayout_init(void)
-{
- int ret = pnfs_register_layoutdriver(&objlayout_type);
-
- if (ret)
- printk(KERN_INFO
- "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n",
- __func__, ret);
- else
- printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n",
- __func__);
- return ret;
-}
-
-static void __exit
-objlayout_exit(void)
-{
- pnfs_unregister_layoutdriver(&objlayout_type);
- printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n",
- __func__);
-}
-
-MODULE_ALIAS("nfs-layouttype4-2");
-
-module_init(objlayout_init);
-module_exit(objlayout_exit);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
deleted file mode 100644
index 8f3d2acb81c3..000000000000
--- a/fs/nfs/objlayout/objlayout.c
+++ /dev/null
@@ -1,706 +0,0 @@
-/*
- * pNFS Objects layout driver high level definitions
- *
- * Copyright (C) 2007 Panasas Inc. [year of first publication]
- * All rights reserved.
- *
- * Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * See the file COPYING included with this distribution for more details.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the Panasas company nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/kmod.h>
-#include <linux/moduleparam.h>
-#include <linux/ratelimit.h>
-#include <scsi/osd_initiator.h>
-#include "objlayout.h"
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-/*
- * Create a objlayout layout structure for the given inode and return it.
- */
-struct pnfs_layout_hdr *
-objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
-{
- struct objlayout *objlay;
-
- objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
- if (!objlay)
- return NULL;
- spin_lock_init(&objlay->lock);
- INIT_LIST_HEAD(&objlay->err_list);
- dprintk("%s: Return %p\n", __func__, objlay);
- return &objlay->pnfs_layout;
-}
-
-/*
- * Free an objlayout layout structure
- */
-void
-objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
-{
- struct objlayout *objlay = OBJLAYOUT(lo);
-
- dprintk("%s: objlay %p\n", __func__, objlay);
-
- WARN_ON(!list_empty(&objlay->err_list));
- kfree(objlay);
-}
-
-/*
- * Unmarshal the layout and store it in pnfslay.
- */
-struct pnfs_layout_segment *
-objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
- struct nfs4_layoutget_res *lgr,
- gfp_t gfp_flags)
-{
- int status = -ENOMEM;
- struct xdr_stream stream;
- struct xdr_buf buf = {
- .pages = lgr->layoutp->pages,
- .page_len = lgr->layoutp->len,
- .buflen = lgr->layoutp->len,
- .len = lgr->layoutp->len,
- };
- struct page *scratch;
- struct pnfs_layout_segment *lseg;
-
- dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
-
- scratch = alloc_page(gfp_flags);
- if (!scratch)
- goto err_nofree;
-
- xdr_init_decode(&stream, &buf, NULL);
- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
-
- status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
- if (unlikely(status)) {
- dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
- status);
- goto err;
- }
-
- __free_page(scratch);
-
- dprintk("%s: Return %p\n", __func__, lseg);
- return lseg;
-
-err:
- __free_page(scratch);
-err_nofree:
- dprintk("%s: Err Return=>%d\n", __func__, status);
- return ERR_PTR(status);
-}
-
-/*
- * Free a layout segment
- */
-void
-objlayout_free_lseg(struct pnfs_layout_segment *lseg)
-{
- dprintk("%s: freeing layout segment %p\n", __func__, lseg);
-
- if (unlikely(!lseg))
- return;
-
- objio_free_lseg(lseg);
-}
-
-/*
- * I/O Operations
- */
-static inline u64
-end_offset(u64 start, u64 len)
-{
- u64 end;
-
- end = start + len;
- return end >= start ? end : NFS4_MAX_UINT64;
-}
-
-static void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
- struct page ***p_pages, unsigned *p_pgbase,
- u64 offset, unsigned long count)
-{
- u64 lseg_end_offset;
-
- BUG_ON(offset < lseg->pls_range.offset);
- lseg_end_offset = end_offset(lseg->pls_range.offset,
- lseg->pls_range.length);
- BUG_ON(offset >= lseg_end_offset);
- WARN_ON(offset + count > lseg_end_offset);
-
- if (*p_pgbase > PAGE_SIZE) {
- dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
- *p_pages += *p_pgbase >> PAGE_SHIFT;
- *p_pgbase &= ~PAGE_MASK;
- }
-}
-
-/*
- * I/O done common code
- */
-static void
-objlayout_iodone(struct objlayout_io_res *oir)
-{
- if (likely(oir->status >= 0)) {
- objio_free_result(oir);
- } else {
- struct objlayout *objlay = oir->objlay;
-
- spin_lock(&objlay->lock);
- objlay->delta_space_valid = OBJ_DSU_INVALID;
- list_add(&objlay->err_list, &oir->err_list);
- spin_unlock(&objlay->lock);
- }
-}
-
-/*
- * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
- *
- * The @index component I/O failed (error returned from the target). Register
- * the error for later reporting at layout-return.
- */
-void
-objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
- struct pnfs_osd_objid *pooid, int osd_error,
- u64 offset, u64 length, bool is_write)
-{
- struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index];
-
- BUG_ON(index >= oir->num_comps);
- if (osd_error) {
- ioerr->oer_component = *pooid;
- ioerr->oer_comp_offset = offset;
- ioerr->oer_comp_length = length;
- ioerr->oer_iswrite = is_write;
- ioerr->oer_errno = osd_error;
-
- dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
- "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
- __func__, index, ioerr->oer_errno,
- ioerr->oer_iswrite,
- _DEVID_LO(&ioerr->oer_component.oid_device_id),
- _DEVID_HI(&ioerr->oer_component.oid_device_id),
- ioerr->oer_component.oid_partition_id,
- ioerr->oer_component.oid_object_id,
- ioerr->oer_comp_offset,
- ioerr->oer_comp_length);
- } else {
- /* The user need not call this if no error is reported */
- ioerr->oer_errno = 0;
- }
-}
-
-/* Function scheduled on the rpc workqueue to call ->nfs_readlist_complete().
- * This is needed because the osd completion is called with interrupts
- * disabled from the block layer.
- */
-static void _rpc_read_complete(struct work_struct *work)
-{
- struct rpc_task *task;
- struct nfs_pgio_header *hdr;
-
- dprintk("%s enter\n", __func__);
- task = container_of(work, struct rpc_task, u.tk_work);
- hdr = container_of(task, struct nfs_pgio_header, task);
-
- pnfs_ld_read_done(hdr);
-}
-
-void
-objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
-{
- struct nfs_pgio_header *hdr = oir->rpcdata;
-
- oir->status = hdr->task.tk_status = status;
- if (status >= 0)
- hdr->res.count = status;
- else
- hdr->pnfs_error = status;
- objlayout_iodone(oir);
- /* must not use oir after this point */
-
- dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
- status, hdr->res.eof, sync);
-
- if (sync)
- pnfs_ld_read_done(hdr);
- else {
- INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete);
- schedule_work(&hdr->task.u.tk_work);
- }
-}
-
-/*
- * Perform sync or async reads.
- */
-enum pnfs_try_status
-objlayout_read_pagelist(struct nfs_pgio_header *hdr)
-{
- struct inode *inode = hdr->inode;
- loff_t offset = hdr->args.offset;
- size_t count = hdr->args.count;
- int err;
- loff_t eof;
-
- eof = i_size_read(inode);
- if (unlikely(offset + count > eof)) {
- if (offset >= eof) {
- err = 0;
- hdr->res.count = 0;
- hdr->res.eof = 1;
- /* FIXME: do we need to call pnfs_ld_read_done()? */
- goto out;
- }
- count = eof - offset;
- }
-
- hdr->res.eof = (offset + count) >= eof;
- _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
- &hdr->args.pgbase,
- hdr->args.offset, hdr->args.count);
-
- dprintk("%s: inode(%lx) offset 0x%llx count 0x%zx eof=%d\n",
- __func__, inode->i_ino, offset, count, hdr->res.eof);
-
- err = objio_read_pagelist(hdr);
- out:
- if (unlikely(err)) {
- hdr->pnfs_error = err;
- dprintk("%s: Returned Error %d\n", __func__, err);
- return PNFS_NOT_ATTEMPTED;
- }
- return PNFS_ATTEMPTED;
-}
-
-/* Function scheduled on the rpc workqueue to call ->nfs_writelist_complete().
- * This is needed because the osd completion is called with interrupts
- * disabled from the block layer.
- */
-static void _rpc_write_complete(struct work_struct *work)
-{
- struct rpc_task *task;
- struct nfs_pgio_header *hdr;
-
- dprintk("%s enter\n", __func__);
- task = container_of(work, struct rpc_task, u.tk_work);
- hdr = container_of(task, struct nfs_pgio_header, task);
-
- pnfs_ld_write_done(hdr);
-}
-
-void
-objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
-{
- struct nfs_pgio_header *hdr = oir->rpcdata;
-
- oir->status = hdr->task.tk_status = status;
- if (status >= 0) {
- hdr->res.count = status;
- hdr->verf.committed = oir->committed;
- } else {
- hdr->pnfs_error = status;
- }
- objlayout_iodone(oir);
- /* must not use oir after this point */
-
- dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
- status, hdr->verf.committed, sync);
-
- if (sync)
- pnfs_ld_write_done(hdr);
- else {
- INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete);
- schedule_work(&hdr->task.u.tk_work);
- }
-}
-
-/*
- * Perform sync or async writes.
- */
-enum pnfs_try_status
-objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how)
-{
- int err;
-
- _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
- &hdr->args.pgbase,
- hdr->args.offset, hdr->args.count);
-
- err = objio_write_pagelist(hdr, how);
- if (unlikely(err)) {
- hdr->pnfs_error = err;
- dprintk("%s: Returned Error %d\n", __func__, err);
- return PNFS_NOT_ATTEMPTED;
- }
- return PNFS_ATTEMPTED;
-}
-
-void
-objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
- struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *args)
-{
- struct objlayout *objlay = OBJLAYOUT(pnfslay);
- struct pnfs_osd_layoutupdate lou;
- __be32 *start;
-
- dprintk("%s: Begin\n", __func__);
-
- spin_lock(&objlay->lock);
- lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
- lou.dsu_delta = objlay->delta_space_used;
- objlay->delta_space_used = 0;
- objlay->delta_space_valid = OBJ_DSU_INIT;
- lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
- spin_unlock(&objlay->lock);
-
- start = xdr_reserve_space(xdr, 4);
-
- BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
-
- *start = cpu_to_be32((xdr->p - start - 1) * 4);
-
- dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
- lou.dsu_delta, lou.olu_ioerr_flag);
-}
-
-static int
-err_prio(u32 oer_errno)
-{
- switch (oer_errno) {
- case 0:
- return 0;
-
- case PNFS_OSD_ERR_RESOURCE:
- return OSD_ERR_PRI_RESOURCE;
- case PNFS_OSD_ERR_BAD_CRED:
- return OSD_ERR_PRI_BAD_CRED;
- case PNFS_OSD_ERR_NO_ACCESS:
- return OSD_ERR_PRI_NO_ACCESS;
- case PNFS_OSD_ERR_UNREACHABLE:
- return OSD_ERR_PRI_UNREACHABLE;
- case PNFS_OSD_ERR_NOT_FOUND:
- return OSD_ERR_PRI_NOT_FOUND;
- case PNFS_OSD_ERR_NO_SPACE:
- return OSD_ERR_PRI_NO_SPACE;
- default:
- WARN_ON(1);
- /* fallthrough */
- case PNFS_OSD_ERR_EIO:
- return OSD_ERR_PRI_EIO;
- }
-}
-
-static void
-merge_ioerr(struct pnfs_osd_ioerr *dest_err,
- const struct pnfs_osd_ioerr *src_err)
-{
- u64 dest_end, src_end;
-
- if (!dest_err->oer_errno) {
- *dest_err = *src_err;
- /* the accumulated device ID must be blank */
- memset(&dest_err->oer_component.oid_device_id, 0,
- sizeof(dest_err->oer_component.oid_device_id));
-
- return;
- }
-
- if (dest_err->oer_component.oid_partition_id !=
- src_err->oer_component.oid_partition_id)
- dest_err->oer_component.oid_partition_id = 0;
-
- if (dest_err->oer_component.oid_object_id !=
- src_err->oer_component.oid_object_id)
- dest_err->oer_component.oid_object_id = 0;
-
- if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
- dest_err->oer_comp_offset = src_err->oer_comp_offset;
-
- dest_end = end_offset(dest_err->oer_comp_offset,
- dest_err->oer_comp_length);
- src_end = end_offset(src_err->oer_comp_offset,
- src_err->oer_comp_length);
- if (dest_end < src_end)
- dest_end = src_end;
-
- dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
-
- if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
- (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
- dest_err->oer_errno = src_err->oer_errno;
- } else if (src_err->oer_iswrite) {
- dest_err->oer_iswrite = true;
- dest_err->oer_errno = src_err->oer_errno;
- }
-}
-
-static void
-encode_accumulated_error(struct objlayout *objlay, __be32 *p)
-{
- struct objlayout_io_res *oir, *tmp;
- struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
-
- list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
- unsigned i;
-
- for (i = 0; i < oir->num_comps; i++) {
- struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
-
- if (!ioerr->oer_errno)
- continue;
-
- printk(KERN_ERR "NFS: %s: err[%d]: errno=%d "
- "is_write=%d dev(%llx:%llx) par=0x%llx "
- "obj=0x%llx offset=0x%llx length=0x%llx\n",
- __func__, i, ioerr->oer_errno,
- ioerr->oer_iswrite,
- _DEVID_LO(&ioerr->oer_component.oid_device_id),
- _DEVID_HI(&ioerr->oer_component.oid_device_id),
- ioerr->oer_component.oid_partition_id,
- ioerr->oer_component.oid_object_id,
- ioerr->oer_comp_offset,
- ioerr->oer_comp_length);
-
- merge_ioerr(&accumulated_err, ioerr);
- }
- list_del(&oir->err_list);
- objio_free_result(oir);
- }
-
- pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
-}
-
-void
-objlayout_encode_layoutreturn(struct xdr_stream *xdr,
- const struct nfs4_layoutreturn_args *args)
-{
- struct pnfs_layout_hdr *pnfslay = args->layout;
- struct objlayout *objlay = OBJLAYOUT(pnfslay);
- struct objlayout_io_res *oir, *tmp;
- __be32 *start;
-
- dprintk("%s: Begin\n", __func__);
- start = xdr_reserve_space(xdr, 4);
- BUG_ON(!start);
-
- spin_lock(&objlay->lock);
-
- list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
- __be32 *last_xdr = NULL, *p;
- unsigned i;
- int res = 0;
-
- for (i = 0; i < oir->num_comps; i++) {
- struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
-
- if (!ioerr->oer_errno)
- continue;
-
- dprintk("%s: err[%d]: errno=%d is_write=%d "
- "dev(%llx:%llx) par=0x%llx obj=0x%llx "
- "offset=0x%llx length=0x%llx\n",
- __func__, i, ioerr->oer_errno,
- ioerr->oer_iswrite,
- _DEVID_LO(&ioerr->oer_component.oid_device_id),
- _DEVID_HI(&ioerr->oer_component.oid_device_id),
- ioerr->oer_component.oid_partition_id,
- ioerr->oer_component.oid_object_id,
- ioerr->oer_comp_offset,
- ioerr->oer_comp_length);
-
- p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
- if (unlikely(!p)) {
- res = -E2BIG;
- break; /* accumulated_error */
- }
-
- last_xdr = p;
- pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
- }
-
- /* TODO: use xdr_write_pages */
- if (unlikely(res)) {
- /* no space for even one error descriptor */
- BUG_ON(!last_xdr);
-
- /* We have encountered a situation with many errors and no
- * space to encode them all. Use the last available slot to
- * report the union of all the remaining errors.
- */
- encode_accumulated_error(objlay, last_xdr);
- goto loop_done;
- }
- list_del(&oir->err_list);
- objio_free_result(oir);
- }
-loop_done:
- spin_unlock(&objlay->lock);
-
- *start = cpu_to_be32((xdr->p - start - 1) * 4);
- dprintk("%s: Return\n", __func__);
-}
-
-enum {
- OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
- OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
- OSD_LOGIN_UPCALL_PATHLEN = 256
-};
-
-static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login";
-
-module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog),
- 0600);
-MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program");
-
-struct __auto_login {
- char uri[OBJLAYOUT_MAX_URI_LEN];
- char osdname[OBJLAYOUT_MAX_OSDNAME_LEN];
- char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN];
-};
-
-static int __objlayout_upcall(struct __auto_login *login)
-{
- static char *envp[] = { "HOME=/",
- "TERM=linux",
- "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
- NULL
- };
- char *argv[8];
- int ret;
-
- if (unlikely(!osd_login_prog[0])) {
- dprintk("%s: osd_login_prog is disabled\n", __func__);
- return -EACCES;
- }
-
- dprintk("%s uri: %s\n", __func__, login->uri);
- dprintk("%s osdname %s\n", __func__, login->osdname);
- dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex);
-
- argv[0] = (char *)osd_login_prog;
- argv[1] = "-u";
- argv[2] = login->uri;
- argv[3] = "-o";
- argv[4] = login->osdname;
- argv[5] = "-s";
- argv[6] = login->systemid_hex;
- argv[7] = NULL;
-
- ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
- /*
- * Disable the upcall mechanism if we're getting an ENOENT or
- * EACCES error. The admin can re-enable it on the fly by using
- * sysfs to set the objlayoutdriver.osd_login_prog module parameter once
- * the problem has been fixed.
- */
- if (ret == -ENOENT || ret == -EACCES) {
- printk(KERN_ERR "PNFS-OBJ: %s was not found please set "
- "objlayoutdriver.osd_login_prog kernel parameter!\n",
- osd_login_prog);
- osd_login_prog[0] = '\0';
- }
- dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret);
-
- return ret;
-}
-
-/* Assume dest is all zeros */
-static void __copy_nfsS_and_zero_terminate(struct nfs4_string s,
- char *dest, int max_len,
- const char *var_name)
-{
- if (!s.len)
- return;
-
- if (s.len >= max_len) {
- pr_warn_ratelimited(
- "objlayout_autologin: %s: s.len(%d) >= max_len(%d)",
- var_name, s.len, max_len);
- s.len = max_len - 1; /* space for null terminator */
- }
-
- memcpy(dest, s.data, s.len);
-}
-
-/* Assume sysid is all zeros */
-static void _sysid_2_hex(struct nfs4_string s,
- char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN])
-{
- int i;
- char *cur;
-
- if (!s.len)
- return;
-
- if (s.len != OSD_SYSTEMID_LEN) {
- pr_warn_ratelimited(
- "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN",
- s.len);
- if (s.len > OSD_SYSTEMID_LEN)
- s.len = OSD_SYSTEMID_LEN;
- }
-
- cur = sysid;
- for (i = 0; i < s.len; i++)
- cur = hex_byte_pack(cur, s.data[i]);
-}
-
-int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr)
-{
- int rc;
- struct __auto_login login;
-
- if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len)
- return -ENODEV;
-
- memset(&login, 0, sizeof(login));
- __copy_nfsS_and_zero_terminate(
- deviceaddr->oda_targetaddr.ota_netaddr.r_addr,
- login.uri, sizeof(login.uri), "URI");
-
- __copy_nfsS_and_zero_terminate(
- deviceaddr->oda_osdname,
- login.osdname, sizeof(login.osdname), "OSDNAME");
-
- _sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex);
-
- rc = __objlayout_upcall(&login);
- if (rc > 0) /* script returns positive values */
- rc = -ENODEV;
-
- return rc;
-}
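
The end_offset() helper deleted above clamps on unsigned overflow rather than wrapping, so a range ending past u64 means "to the end of the file". A minimal userspace sketch of the same computation, with NFS4_MAX_UINT64 standing in as UINT64_MAX:

#include <stdint.h>
#include <stdio.h>

#define NFS4_MAX_UINT64 UINT64_MAX

static uint64_t end_offset(uint64_t start, uint64_t len)
{
	uint64_t end = start + len;

	/* Unsigned overflow wraps, so a wrapped end is smaller than start. */
	return end >= start ? end : NFS4_MAX_UINT64;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)end_offset(10, 20));	/* 30 */
	printf("%llu\n", (unsigned long long)end_offset(UINT64_MAX, 2)); /* clamped */
	return 0;
}
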
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
deleted file mode 100644
index fc94a5872ed4..000000000000
--- a/fs/nfs/objlayout/objlayout.h
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Data types and function declarations for interfacing with the
- * pNFS standard object layout driver.
- *
- * Copyright (C) 2007 Panasas Inc. [year of first publication]
- * All rights reserved.
- *
- * Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * See the file COPYING included with this distribution for more details.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the Panasas company nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _OBJLAYOUT_H
-#define _OBJLAYOUT_H
-
-#include <linux/nfs_fs.h>
-#include <linux/pnfs_osd_xdr.h>
-#include "../pnfs.h"
-
-/*
- * per-inode layout
- */
-struct objlayout {
- struct pnfs_layout_hdr pnfs_layout;
-
- /* for layout_commit */
- enum osd_delta_space_valid_enum {
- OBJ_DSU_INIT = 0,
- OBJ_DSU_VALID,
- OBJ_DSU_INVALID,
- } delta_space_valid;
- s64 delta_space_used; /* consumed by write ops */
-
- /* for layout_return */
- spinlock_t lock;
- struct list_head err_list;
-};
-
-static inline struct objlayout *
-OBJLAYOUT(struct pnfs_layout_hdr *lo)
-{
- return container_of(lo, struct objlayout, pnfs_layout);
-}
-
-/*
- * per-I/O operation state
- * embedded in objects provider io_state data structure
- */
-struct objlayout_io_res {
- struct objlayout *objlay;
-
- void *rpcdata;
- int status; /* res */
- int committed; /* res */
-
- /* Error reporting (layout_return) */
- struct list_head err_list;
- unsigned num_comps;
- /* Pointer to an array of error descriptors of size num_comps.
- * It should contain as many entries as there are devices in the
- * osd_layout that participate in the I/O. It is up to the io_engine
- * to allocate the needed space and set num_comps.
- */
- struct pnfs_osd_ioerr *ioerrs;
-};
-
-static inline
-void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps,
- struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
- struct pnfs_layout_hdr *pnfs_layout_type)
-{
- oir->objlay = OBJLAYOUT(pnfs_layout_type);
- oir->rpcdata = rpcdata;
- INIT_LIST_HEAD(&oir->err_list);
- oir->num_comps = num_comps;
- oir->ioerrs = ioerrs;
-}
-
-/*
- * Raid engine I/O API
- */
-extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
- struct pnfs_layout_hdr *pnfslay,
- struct pnfs_layout_range *range,
- struct xdr_stream *xdr,
- gfp_t gfp_flags);
-extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
-
-/* objio_free_result will free these @oir structs received from
- * objlayout_{read,write}_done
- */
-extern void objio_free_result(struct objlayout_io_res *oir);
-
-extern int objio_read_pagelist(struct nfs_pgio_header *rdata);
-extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how);
-
-/*
- * callback API
- */
-extern void objlayout_io_set_result(struct objlayout_io_res *oir,
- unsigned index, struct pnfs_osd_objid *pooid,
- int osd_error, u64 offset, u64 length, bool is_write);
-
-static inline void
-objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
-{
- /* If one of the I/Os errored out and the delta_space_used was
- * invalid, we render the complete report as invalid. The protocol
- * mandates that the DSU be accurate or not reported.
- */
- spin_lock(&objlay->lock);
- if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
- objlay->delta_space_valid = OBJ_DSU_VALID;
- objlay->delta_space_used += space_used;
- }
- spin_unlock(&objlay->lock);
-}
-
-extern void objlayout_read_done(struct objlayout_io_res *oir,
- ssize_t status, bool sync);
-extern void objlayout_write_done(struct objlayout_io_res *oir,
- ssize_t status, bool sync);
-
-/*
- * exported generic objects function vectors
- */
-
-extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
-extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
-
-extern struct pnfs_layout_segment *objlayout_alloc_lseg(
- struct pnfs_layout_hdr *,
- struct nfs4_layoutget_res *,
- gfp_t gfp_flags);
-extern void objlayout_free_lseg(struct pnfs_layout_segment *);
-
-extern enum pnfs_try_status objlayout_read_pagelist(
- struct nfs_pgio_header *);
-
-extern enum pnfs_try_status objlayout_write_pagelist(
- struct nfs_pgio_header *,
- int how);
-
-extern void objlayout_encode_layoutcommit(
- struct pnfs_layout_hdr *,
- struct xdr_stream *,
- const struct nfs4_layoutcommit_args *);
-
-extern void objlayout_encode_layoutreturn(
- struct xdr_stream *,
- const struct nfs4_layoutreturn_args *);
-
-extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr);
-
-#endif /* _OBJLAYOUT_H */
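
OBJLAYOUT() above relies on the usual embed-and-recover idiom: the generic pnfs_layout_hdr is embedded as the first member of the driver-private struct, and container_of() walks back from the header to the containing object. A standalone userspace sketch of the idiom (struct names here are illustrative):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct pnfs_layout_hdr { int placeholder; };	/* stand-in for the real header */

struct objlayout_like {
	struct pnfs_layout_hdr pnfs_layout;	/* embedded generic header */
	long delta_space_used;
};

int main(void)
{
	struct objlayout_like layout = { .delta_space_used = 42 };
	struct pnfs_layout_hdr *lo = &layout.pnfs_layout;

	/* Recover the containing driver structure from the generic header. */
	struct objlayout_like *back =
		container_of(lo, struct objlayout_like, pnfs_layout);
	printf("%ld\n", back->delta_space_used);	/* 42 */
	return 0;
}
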
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
deleted file mode 100644
index f093c7ec983b..000000000000
--- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+++ /dev/null
@@ -1,415 +0,0 @@
-/*
- * Object-Based pNFS Layout XDR layer
- *
- * Copyright (C) 2007 Panasas Inc. [year of first publication]
- * All rights reserved.
- *
- * Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * See the file COPYING included with this distribution for more details.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the Panasas company nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/pnfs_osd_xdr.h>
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-/*
- * The following implementation is based on RFC5664
- */
-
-/*
- * struct pnfs_osd_objid {
- * struct nfs4_deviceid oid_device_id;
- * u64 oid_partition_id;
- * u64 oid_object_id;
- * }; // xdr size 32 bytes
- */
-static __be32 *
-_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
-{
- p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data,
- sizeof(objid->oid_device_id.data));
-
- p = xdr_decode_hyper(p, &objid->oid_partition_id);
- p = xdr_decode_hyper(p, &objid->oid_object_id);
- return p;
-}
-/*
- * struct pnfs_osd_opaque_cred {
- * u32 cred_len;
- * void *cred;
- * }; // xdr size [variable]
- * The returned pointers point into the xdr buffer.
- */
-static int
-_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
- struct xdr_stream *xdr)
-{
- __be32 *p = xdr_inline_decode(xdr, 4);
-
- if (!p)
- return -EINVAL;
-
- opaque_cred->cred_len = be32_to_cpu(*p++);
-
- p = xdr_inline_decode(xdr, opaque_cred->cred_len);
- if (!p)
- return -EINVAL;
-
- opaque_cred->cred = p;
- return 0;
-}
-
-/*
- * struct pnfs_osd_object_cred {
- * struct pnfs_osd_objid oc_object_id;
- * u32 oc_osd_version;
- * u32 oc_cap_key_sec;
- * struct pnfs_osd_opaque_cred oc_cap_key
- * struct pnfs_osd_opaque_cred oc_cap;
- * }; // xdr size 32 + 4 + 4 + [variable] + [variable]
- */
-static int
-_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
- struct xdr_stream *xdr)
-{
- __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4);
- int ret;
-
- if (!p)
- return -EIO;
-
- p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
- comp->oc_osd_version = be32_to_cpup(p++);
- comp->oc_cap_key_sec = be32_to_cpup(p);
-
- ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
- if (unlikely(ret))
- return ret;
-
- ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
- return ret;
-}
-
-/*
- * struct pnfs_osd_data_map {
- * u32 odm_num_comps;
- * u64 odm_stripe_unit;
- * u32 odm_group_width;
- * u32 odm_group_depth;
- * u32 odm_mirror_cnt;
- * u32 odm_raid_algorithm;
- * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
- */
-static inline int
-_osd_data_map_xdr_sz(void)
-{
- return 4 + 8 + 4 + 4 + 4 + 4;
-}
-
-static __be32 *
-_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
-{
- data_map->odm_num_comps = be32_to_cpup(p++);
- p = xdr_decode_hyper(p, &data_map->odm_stripe_unit);
- data_map->odm_group_width = be32_to_cpup(p++);
- data_map->odm_group_depth = be32_to_cpup(p++);
- data_map->odm_mirror_cnt = be32_to_cpup(p++);
- data_map->odm_raid_algorithm = be32_to_cpup(p++);
- dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
- "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
- __func__,
- data_map->odm_num_comps,
- (unsigned long long)data_map->odm_stripe_unit,
- data_map->odm_group_width,
- data_map->odm_group_depth,
- data_map->odm_mirror_cnt,
- data_map->odm_raid_algorithm);
- return p;
-}
-
-int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
- struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
-{
- __be32 *p;
-
- memset(iter, 0, sizeof(*iter));
-
- p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4);
- if (unlikely(!p))
- return -EINVAL;
-
- p = _osd_xdr_decode_data_map(p, &layout->olo_map);
- layout->olo_comps_index = be32_to_cpup(p++);
- layout->olo_num_comps = be32_to_cpup(p++);
- dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__,
- layout->olo_comps_index, layout->olo_num_comps);
-
- iter->total_comps = layout->olo_num_comps;
- return 0;
-}
-
-bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
- struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
- int *err)
-{
- BUG_ON(iter->decoded_comps > iter->total_comps);
- if (iter->decoded_comps == iter->total_comps)
- return false;
-
- *err = _osd_xdr_decode_object_cred(comp, xdr);
- if (unlikely(*err)) {
- dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
- "total_comps=%d\n", __func__, *err,
- iter->decoded_comps, iter->total_comps);
- return false; /* stop the loop */
- }
- dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
- "key_len=%u cap_len=%u\n",
- __func__,
- _DEVID_LO(&comp->oc_object_id.oid_device_id),
- _DEVID_HI(&comp->oc_object_id.oid_device_id),
- comp->oc_object_id.oid_partition_id,
- comp->oc_object_id.oid_object_id,
- comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
-
- iter->decoded_comps++;
- return true;
-}
-
-/*
- * Get Device Information Decoding
- *
- * Note: since Device Information is currently done synchronously, all
- * variable string fields are left inside the rpc buffer and are only
- * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
- * should not be freed while the returned information is in use.
- */
-/*
- *struct nfs4_string {
- * unsigned int len;
- * char *data;
- *}; // size [variable]
- * NOTE: Returned string points to inside the XDR buffer
- */
-static __be32 *
-__read_u8_opaque(__be32 *p, struct nfs4_string *str)
-{
- str->len = be32_to_cpup(p++);
- str->data = (char *)p;
-
- p += XDR_QUADLEN(str->len);
- return p;
-}
-
-/*
- * struct pnfs_osd_targetid {
- * u32 oti_type;
- * struct nfs4_string oti_scsi_device_id;
- * };// size 4 + [variable]
- */
-static __be32 *
-__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid)
-{
- u32 oti_type;
-
- oti_type = be32_to_cpup(p++);
- targetid->oti_type = oti_type;
-
- switch (oti_type) {
- case OBJ_TARGET_SCSI_NAME:
- case OBJ_TARGET_SCSI_DEVICE_ID:
- p = __read_u8_opaque(p, &targetid->oti_scsi_device_id);
- }
-
- return p;
-}
-
-/*
- * struct pnfs_osd_net_addr {
- * struct nfs4_string r_netid;
- * struct nfs4_string r_addr;
- * };
- */
-static __be32 *
-__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr)
-{
- p = __read_u8_opaque(p, &netaddr->r_netid);
- p = __read_u8_opaque(p, &netaddr->r_addr);
-
- return p;
-}
-
-/*
- * struct pnfs_osd_targetaddr {
- * u32 ota_available;
- * struct pnfs_osd_net_addr ota_netaddr;
- * };
- */
-static __be32 *
-__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
-{
- u32 ota_available;
-
- ota_available = be32_to_cpup(p++);
- targetaddr->ota_available = ota_available;
-
- if (ota_available)
- p = __read_net_addr(p, &targetaddr->ota_netaddr);
-
- return p;
-}
-
-/*
- * struct pnfs_osd_deviceaddr {
- * struct pnfs_osd_targetid oda_targetid;
- * struct pnfs_osd_targetaddr oda_targetaddr;
- * u8 oda_lun[8];
- * struct nfs4_string oda_systemid;
- * struct pnfs_osd_object_cred oda_root_obj_cred;
- * struct nfs4_string oda_osdname;
- * };
- */
-
-/* We need this version for pnfs_osd_xdr_decode_deviceaddr(), which does
- * not have an xdr_stream.
- */
-static __be32 *
-__read_opaque_cred(__be32 *p,
- struct pnfs_osd_opaque_cred *opaque_cred)
-{
- opaque_cred->cred_len = be32_to_cpu(*p++);
- opaque_cred->cred = p;
- return p + XDR_QUADLEN(opaque_cred->cred_len);
-}
-
-static __be32 *
-__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
-{
- p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
- comp->oc_osd_version = be32_to_cpup(p++);
- comp->oc_cap_key_sec = be32_to_cpup(p++);
-
- p = __read_opaque_cred(p, &comp->oc_cap_key);
- p = __read_opaque_cred(p, &comp->oc_cap);
- return p;
-}
-
-void pnfs_osd_xdr_decode_deviceaddr(
- struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
-{
- p = __read_targetid(p, &deviceaddr->oda_targetid);
-
- p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
-
- p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
- sizeof(deviceaddr->oda_lun));
-
- p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
-
- p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
-
- p = __read_u8_opaque(p, &deviceaddr->oda_osdname);
-
- /* libosd likes this NUL-terminated when debugging. It's the last field, so this is safe */
- deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0;
-}
-
-/*
- * struct pnfs_osd_layoutupdate {
- * u32 dsu_valid;
- * s64 dsu_delta;
- * u32 olu_ioerr_flag;
- * }; xdr size 4 + 8 + 4
- */
-int
-pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
- struct pnfs_osd_layoutupdate *lou)
-{
- __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4);
-
- if (!p)
- return -E2BIG;
-
- *p++ = cpu_to_be32(lou->dsu_valid);
- if (lou->dsu_valid)
- p = xdr_encode_hyper(p, lou->dsu_delta);
- *p++ = cpu_to_be32(lou->olu_ioerr_flag);
- return 0;
-}
-
-/*
- * struct pnfs_osd_objid {
- * struct nfs4_deviceid oid_device_id;
- * u64 oid_partition_id;
- * u64 oid_object_id;
- * }; // xdr size 32 bytes
- */
-static inline __be32 *
-pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
-{
- p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
- sizeof(object_id->oid_device_id.data));
- p = xdr_encode_hyper(p, object_id->oid_partition_id);
- p = xdr_encode_hyper(p, object_id->oid_object_id);
-
- return p;
-}
-
-/*
- * struct pnfs_osd_ioerr {
- * struct pnfs_osd_objid oer_component;
- * u64 oer_comp_offset;
- * u64 oer_comp_length;
- * u32 oer_iswrite;
- * u32 oer_errno;
- * }; // xdr size 32 + 24 bytes
- */
-void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
-{
- p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
- p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
- p = xdr_encode_hyper(p, ioerr->oer_comp_length);
- *p++ = cpu_to_be32(ioerr->oer_iswrite);
- *p = cpu_to_be32(ioerr->oer_errno);
-}
-
-__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
-{
- __be32 *p;
-
- p = xdr_reserve_space(xdr, 32 + 24);
- if (unlikely(!p))
- dprintk("%s: out of xdr space\n", __func__);
-
- return p;
-}
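
The decoders in the file removed above lean on two XDR primitives: 32-bit fields are big-endian words, and 64-bit "hypers" are two such words, high half first. A userspace sketch of both, assuming nothing beyond ntohl():

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static const uint32_t *decode_u32(const uint32_t *p, uint32_t *out)
{
	*out = ntohl(*p);	/* wire format is big-endian */
	return p + 1;
}

static const uint32_t *decode_hyper(const uint32_t *p, uint64_t *out)
{
	uint32_t hi, lo;

	p = decode_u32(p, &hi);
	p = decode_u32(p, &lo);
	*out = ((uint64_t)hi << 32) | lo;
	return p;
}

int main(void)
{
	/* 0x00000001 0x00000002 0x00000003 on the wire */
	uint32_t buf[3] = { htonl(1), htonl(2), htonl(3) };
	const uint32_t *p = buf;
	uint32_t v32;
	uint64_t v64;

	p = decode_u32(p, &v32);
	p = decode_hyper(p, &v64);
	printf("%u 0x%llx\n", v32, (unsigned long long)v64); /* 1 0x200000003 */
	return 0;
}
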
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 6e629b856a00..ad92b401326c 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -29,19 +29,6 @@
static struct kmem_cache *nfs_page_cachep;
static const struct rpc_call_ops nfs_pgio_common_ops;
-static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
-{
- p->npages = pagecount;
- if (pagecount <= ARRAY_SIZE(p->page_array))
- p->pagevec = p->page_array;
- else {
- p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
- if (!p->pagevec)
- p->npages = 0;
- }
- return p->pagevec != NULL;
-}
-
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
{
@@ -115,6 +102,35 @@ nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
TASK_KILLABLE);
}
+/**
+ * nfs_async_iocounter_wait - wait on an rpc_waitqueue for I/O
+ * to complete
+ * @task: the rpc_task that should wait
+ * @l_ctx: nfs_lock_context with io_counter to check
+ *
+ * Returns true if there is outstanding I/O to wait on and the
+ * task has been put to sleep.
+ */
+bool
+nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx)
+{
+ struct inode *inode = d_inode(l_ctx->open_context->dentry);
+ bool ret = false;
+
+ if (atomic_read(&l_ctx->io_count) > 0) {
+ rpc_sleep_on(&NFS_SERVER(inode)->uoc_rpcwaitq, task, NULL);
+ ret = true;
+ }
+
+ if (atomic_read(&l_ctx->io_count) == 0) {
+ rpc_wake_up_queued_task(&NFS_SERVER(inode)->uoc_rpcwaitq, task);
+ ret = false;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
+
/*
* nfs_page_group_lock - lock the head of the page group
* @req - request in group that is to be locked
@@ -398,8 +414,11 @@ static void nfs_clear_request(struct nfs_page *req)
req->wb_page = NULL;
}
if (l_ctx != NULL) {
- if (atomic_dec_and_test(&l_ctx->io_count))
+ if (atomic_dec_and_test(&l_ctx->io_count)) {
wake_up_atomic_t(&l_ctx->io_count);
+ if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags))
+ rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq);
+ }
nfs_put_lock_context(l_ctx);
req->wb_lock_context = NULL;
}
@@ -677,7 +696,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
const struct nfs_pgio_completion_ops *compl_ops,
const struct nfs_rw_ops *rw_ops,
size_t bsize,
- int io_flags)
+ int io_flags,
+ gfp_t gfp_flags)
{
struct nfs_pgio_mirror *new;
int i;
@@ -701,7 +721,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
/* until we have a request, we don't have an lseg and have no
* idea how many mirrors there will be */
new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX,
- sizeof(struct nfs_pgio_mirror), GFP_KERNEL);
+ sizeof(struct nfs_pgio_mirror), gfp_flags);
desc->pg_mirrors_dynamic = new;
desc->pg_mirrors = new;
@@ -754,13 +774,24 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
*last_page;
struct list_head *head = &mirror->pg_list;
struct nfs_commit_info cinfo;
+ struct nfs_page_array *pg_array = &hdr->page_array;
unsigned int pagecount, pageused;
+ gfp_t gfp_flags = GFP_KERNEL;
pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
- if (!nfs_pgarray_set(&hdr->page_array, pagecount)) {
- nfs_pgio_error(hdr);
- desc->pg_error = -ENOMEM;
- return desc->pg_error;
+
+ if (pagecount <= ARRAY_SIZE(pg_array->page_array))
+ pg_array->pagevec = pg_array->page_array;
+ else {
+ if (hdr->rw_mode == FMODE_WRITE)
+ gfp_flags = GFP_NOIO;
+ pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags);
+ if (!pg_array->pagevec) {
+ pg_array->npages = 0;
+ nfs_pgio_error(hdr);
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
+ }
}
nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
@@ -1256,8 +1287,10 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
mirror = &desc->pg_mirrors[midx];
if (!list_empty(&mirror->pg_list)) {
prev = nfs_list_entry(mirror->pg_list.prev);
- if (index != prev->wb_index + 1)
- nfs_pageio_complete_mirror(desc, midx);
+ if (index != prev->wb_index + 1) {
+ nfs_pageio_complete(desc);
+ break;
+ }
}
}
}
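
The nfs_generic_pgio() hunk above open-codes the old nfs_pgarray_set() logic so the write path can pass GFP_NOIO. The underlying pattern, small counts served from an embedded array with a heap fallback, looks roughly like this in userspace (sizes and names are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define INLINE_PAGES 8	/* stand-in for ARRAY_SIZE(pg_array->page_array) */

struct page_array {
	void *inline_pages[INLINE_PAGES];
	void **pagevec;
	unsigned int npages;
};

static int page_array_set(struct page_array *p, unsigned int pagecount)
{
	if (pagecount <= INLINE_PAGES) {
		p->pagevec = p->inline_pages;	/* common case: no allocation */
	} else {
		p->pagevec = calloc(pagecount, sizeof(void *));
		if (!p->pagevec) {
			p->npages = 0;
			return -1;	/* caller reports -ENOMEM */
		}
	}
	p->npages = pagecount;
	return 0;
}

int main(void)
{
	struct page_array pa;

	printf("%d\n", page_array_set(&pa, 4));		/* 0: inline array */
	printf("%d\n", page_array_set(&pa, 64));	/* 0: heap fallback */
	return 0;
}
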
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index dd042498ce7c..c383d0913b54 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -322,9 +322,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
static void
pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
{
+ struct pnfs_layout_segment *lseg;
lo->plh_return_iomode = 0;
lo->plh_return_seq = 0;
clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+ if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ continue;
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+ }
}
static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
@@ -367,9 +373,9 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
struct pnfs_layout_segment *lseg, *next;
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- pnfs_clear_layoutreturn_info(lo);
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
pnfs_clear_lseg_state(lseg, lseg_list);
+ pnfs_clear_layoutreturn_info(lo);
pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
!test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
@@ -563,7 +569,6 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
}
}
}
-EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);
/*
* is l2 fully contained in l1?
@@ -728,6 +733,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
+ nfs_commit_inode(&nfsi->vfs_inode, 0);
pnfs_put_layout_hdr(lo);
} else
spin_unlock(&nfsi->vfs_inode.i_lock);
@@ -1209,7 +1215,6 @@ out:
dprintk("<-- %s status: %d\n", __func__, status);
return status;
}
-EXPORT_SYMBOL_GPL(_pnfs_return_layout);
int
pnfs_commit_and_return_layout(struct inode *inode)
@@ -1991,6 +1996,8 @@ out_forget:
spin_unlock(&ino->i_lock);
lseg->pls_layout = lo;
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ if (!pnfs_layout_is_valid(lo))
+ nfs_commit_inode(ino, 0);
return ERR_PTR(-EAGAIN);
}
@@ -2051,9 +2058,11 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
bool return_now = false;
spin_lock(&inode->i_lock);
+ if (!pnfs_layout_is_valid(lo)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
pnfs_set_plh_return_info(lo, range.iomode, 0);
- /* Block LAYOUTGET */
- set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
@@ -2075,10 +2084,36 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
void
+pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
+{
+ if (pgio->pg_lseg == NULL ||
+ test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags))
+ return;
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
+
+/*
+ * Check for any intersection between the request and the pgio->pg_lseg,
+ * and if none, put this pgio->pg_lseg away.
+ */
+static void
+pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+ if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ }
+}
+
+void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
u64 rd_size = req->wb_bytes;
+ pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_range(pgio, req);
if (pgio->pg_lseg == NULL) {
if (pgio->pg_dreq == NULL)
rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
@@ -2109,6 +2144,8 @@ void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size)
{
+ pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_range(pgio, req);
if (pgio->pg_lseg == NULL) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
@@ -2169,16 +2206,10 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset,
pgio->pg_lseg->pls_range.length);
req_start = req_offset(req);
- WARN_ON_ONCE(req_start >= seg_end);
+
/* start of request is past the last byte of this segment */
- if (req_start >= seg_end) {
- /* reference the new lseg */
- if (pgio->pg_ops->pg_cleanup)
- pgio->pg_ops->pg_cleanup(pgio);
- if (pgio->pg_ops->pg_init)
- pgio->pg_ops->pg_init(pgio, req);
+ if (req_start >= seg_end)
return 0;
- }
/* adjust 'size' iff there are fewer bytes left in the
* segment than what nfs_generic_pg_test returned */
@@ -2277,8 +2308,20 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc,
enum pnfs_try_status trypnfs;
trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
- if (trypnfs == PNFS_NOT_ATTEMPTED)
+ switch (trypnfs) {
+ case PNFS_NOT_ATTEMPTED:
pnfs_write_through_mds(desc, hdr);
+ case PNFS_ATTEMPTED:
+ break;
+ case PNFS_TRY_AGAIN:
+ /* cleanup hdr and prepare to redo pnfs */
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+ list_splice_init(&hdr->pages, &mirror->pg_list);
+ mirror->pg_recoalesce = 1;
+ }
+ hdr->mds_ops->rpc_release(hdr);
+ }
}
static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
@@ -2408,10 +2451,20 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
enum pnfs_try_status trypnfs;
trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
- if (trypnfs == PNFS_TRY_AGAIN)
- pnfs_read_resend_pnfs(hdr);
- if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status)
+ switch (trypnfs) {
+ case PNFS_NOT_ATTEMPTED:
pnfs_read_through_mds(desc, hdr);
+ case PNFS_ATTEMPTED:
+ break;
+ case PNFS_TRY_AGAIN:
+ /* cleanup hdr and prepare to redo pnfs */
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+ list_splice_init(&hdr->pages, &mirror->pg_list);
+ mirror->pg_recoalesce = 1;
+ }
+ hdr->mds_ops->rpc_release(hdr);
+ }
}
static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
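
Both pnfs_do_read() and pnfs_do_write() now dispatch on the try status, with PNFS_NOT_ATTEMPTED deliberately falling through after the MDS fallback and PNFS_TRY_AGAIN requeueing the pages for recoalescing. A compact, runnable sketch of that dispatch shape, with the helpers reduced to printf stand-ins:

#include <stdio.h>

enum try_status { TRY_NOT_ATTEMPTED, TRY_ATTEMPTED, TRY_AGAIN };

static void fall_back_to_mds(void)       { puts("resend through the MDS"); }
static void requeue_for_recoalesce(void) { puts("recoalesce and retry pnfs"); }

static void dispatch(enum try_status st)
{
	switch (st) {
	case TRY_NOT_ATTEMPTED:
		fall_back_to_mds();
		/* deliberate fall through: nothing further to redo */
	case TRY_ATTEMPTED:
		break;
	case TRY_AGAIN:
		requeue_for_recoalesce();
	}
}

int main(void)
{
	dispatch(TRY_NOT_ATTEMPTED);
	dispatch(TRY_AGAIN);
	return 0;
}
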
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 590e1e35781f..99731e3e332f 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -173,14 +173,9 @@ struct pnfs_layoutdriver_type {
gfp_t gfp_flags);
int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *);
- void (*encode_layoutreturn) (struct xdr_stream *xdr,
- const struct nfs4_layoutreturn_args *args);
void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
- void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
- struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *args);
int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
};
@@ -239,6 +234,7 @@ void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg);
void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
void unset_pnfs_layoutdriver(struct nfs_server *);
+void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio);
void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
@@ -597,6 +593,16 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
return pnfs_is_range_intersecting(l1->offset, end1, l2->offset, end2);
}
+static inline bool
+pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page *req)
+{
+ u64 seg_last = pnfs_end_offset(lseg->pls_range.offset, lseg->pls_range.length);
+ u64 req_last = req_offset(req) + req->wb_bytes;
+
+ return pnfs_is_range_intersecting(lseg->pls_range.offset, seg_last,
+ req_offset(req), req_last);
+}
+
extern unsigned int layoutstats_timer;
#ifdef NFS_DEBUG
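
pnfs_lseg_request_intersecting(), added above, is a half-open interval test: two byte ranges intersect iff each starts before the other ends. A simplified userspace sketch, ignoring the open-ended NFS4_MAX_UINT64 special case the kernel helper also handles:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Two half-open byte ranges [o1, e1) and [o2, e2) intersect iff each
 * starts before the other ends. */
static bool ranges_intersect(uint64_t o1, uint64_t e1, uint64_t o2, uint64_t e2)
{
	return o1 < e2 && o2 < e1;
}

int main(void)
{
	printf("%d\n", ranges_intersect(0, 100, 50, 150));	/* 1: overlap */
	printf("%d\n", ranges_intersect(0, 100, 100, 200));	/* 0: merely adjacent */
	return 0;
}
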
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 7250b95549ec..d40755a0984b 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -217,7 +217,14 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
if (list_empty(&bucket->committing))
continue;
- data = nfs_commitdata_alloc();
+ /*
+ * If the layout segment is invalid, then let
+ * pnfs_generic_retry_commit() clean up the bucket.
+ */
+ if (bucket->clseg && !pnfs_is_valid_lseg(bucket->clseg) &&
+ !test_bit(NFS_LSEG_LAYOUTRETURN, &bucket->clseg->pls_flags))
+ break;
+ data = nfs_commitdata_alloc(false);
if (!data)
break;
data->ds_commit_index = i;
@@ -283,16 +290,10 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
unsigned int nreq = 0;
if (!list_empty(mds_pages)) {
- data = nfs_commitdata_alloc();
- if (data != NULL) {
- data->ds_commit_index = -1;
- list_add(&data->pages, &list);
- nreq++;
- } else {
- nfs_retry_commit(mds_pages, NULL, cinfo, 0);
- pnfs_generic_retry_commit(cinfo, 0);
- return -ENOMEM;
- }
+ data = nfs_commitdata_alloc(true);
+ data->ds_commit_index = -1;
+ list_add(&data->pages, &list);
+ nreq++;
}
nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
@@ -619,7 +620,6 @@ void nfs4_pnfs_v3_ds_connect_unload(void)
get_v3_ds_connect = NULL;
}
}
-EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload);
static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
struct nfs4_pnfs_ds *ds,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index b7bca8303989..9872cf676a50 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -638,7 +638,7 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
{
struct inode *inode = file_inode(filp);
- return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
+ return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, NULL);
}
/* Helper functions for NFS lock bounds checking */
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index defc9233e985..a8421d9dab6a 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -35,7 +35,11 @@ static struct kmem_cache *nfs_rdata_cachep;
static struct nfs_pgio_header *nfs_readhdr_alloc(void)
{
- return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
+ struct nfs_pgio_header *p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
+
+ if (p)
+ p->rw_mode = FMODE_READ;
+ return p;
}
static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
@@ -64,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
pg_ops = server->pnfs_curr_ld->pg_read_ops;
#endif
nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops,
- server->rsize, 0);
+ server->rsize, 0, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
@@ -451,7 +455,6 @@ void nfs_destroy_readpagecache(void)
}
static const struct nfs_rw_ops nfs_rw_read_ops = {
- .rw_mode = FMODE_READ,
.rw_alloc_header = nfs_readhdr_alloc,
.rw_free_header = nfs_readhdr_free,
.rw_done = nfs_readpage_done,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2f3822a4a7d5..c5334c0e23a1 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2301,7 +2301,7 @@ EXPORT_SYMBOL_GPL(nfs_remount);
/*
* Initialise the common bits of the superblock
*/
-inline void nfs_initialise_sb(struct super_block *sb)
+static void nfs_initialise_sb(struct super_block *sb)
{
struct nfs_server *server = NFS_SB(sb);
@@ -2348,7 +2348,8 @@ EXPORT_SYMBOL_GPL(nfs_fill_super);
/*
* Finish setting up a cloned NFS2/3/4 superblock
*/
-void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+static void nfs_clone_super(struct super_block *sb,
+ struct nfs_mount_info *mount_info)
{
const struct super_block *old_sb = mount_info->cloned->sb;
struct nfs_server *server = NFS_SB(sb);
@@ -2544,10 +2545,25 @@ EXPORT_SYMBOL_GPL(nfs_set_sb_security);
int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot,
struct nfs_mount_info *mount_info)
{
+ int error;
+ unsigned long kflags = 0, kflags_out = 0;
+
/* clone any lsm security options from the parent to the new sb */
if (d_inode(mntroot)->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops)
return -ESTALE;
- return security_sb_clone_mnt_opts(mount_info->cloned->sb, s);
+
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
+ kflags |= SECURITY_LSM_NATIVE_LABELS;
+
+ error = security_sb_clone_mnt_opts(mount_info->cloned->sb, s, kflags,
+ &kflags_out);
+ if (error)
+ return error;
+
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
+ !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
+ NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
+ return 0;
}
EXPORT_SYMBOL_GPL(nfs_clone_sb_security);
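
nfs_clone_sb_security() now follows a flags-in/flags-out negotiation: the caller requests SECURITY_LSM_NATIVE_LABELS, the LSM reports which flags it actually honoured, and the capability is dropped if it wasn't granted. A generic userspace sketch of that pattern; the flag name and the stub below are illustrative, not the LSM API:

#include <stdio.h>

#define WANT_NATIVE_LABELS 0x1UL

/* Stand-in for security_sb_clone_mnt_opts(): reports via *kflags_out
 * which of the requested kflags were actually honoured. */
static int clone_opts(unsigned long kflags, unsigned long *kflags_out)
{
	(void)kflags;
	*kflags_out = 0;	/* pretend the LSM honours nothing */
	return 0;
}

int main(void)
{
	unsigned long caps = WANT_NATIVE_LABELS;
	unsigned long kflags = 0, kflags_out = 0;

	if (caps & WANT_NATIVE_LABELS)
		kflags |= WANT_NATIVE_LABELS;

	if (clone_opts(kflags, &kflags_out))
		return 1;

	/* Requested but not granted: withdraw the capability. */
	if ((caps & WANT_NATIVE_LABELS) && !(kflags_out & WANT_NATIVE_LABELS))
		caps &= ~WANT_NATIVE_LABELS;

	printf("caps=%#lx\n", caps);	/* 0: label support withdrawn */
	return 0;
}
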
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index cc341fc7fd44..db7ba542559e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -60,14 +60,28 @@ static mempool_t *nfs_wdata_mempool;
static struct kmem_cache *nfs_cdata_cachep;
static mempool_t *nfs_commit_mempool;
-struct nfs_commit_data *nfs_commitdata_alloc(void)
+struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail)
{
- struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
+ struct nfs_commit_data *p;
- if (p) {
- memset(p, 0, sizeof(*p));
- INIT_LIST_HEAD(&p->pages);
+ if (never_fail)
+ p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
+ else {
+ /* It is OK to do some reclaim, but not safe to wait
+ * for anything to be returned to the pool.
+ * mempool_alloc() cannot handle that particular combination,
+ * so we need two separate attempts.
+ */
+ p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
+ if (!p)
+ p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO |
+ __GFP_NOWARN | __GFP_NORETRY);
+ if (!p)
+ return NULL;
}
+
+ memset(p, 0, sizeof(*p));
+ INIT_LIST_HEAD(&p->pages);
return p;
}
EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
@@ -82,8 +96,10 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void)
{
struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
- if (p)
+ if (p) {
memset(p, 0, sizeof(*p));
+ p->rw_mode = FMODE_WRITE;
+ }
return p;
}
@@ -547,9 +563,21 @@ static void nfs_write_error_remove_page(struct nfs_page *req)
{
nfs_unlock_request(req);
nfs_end_page_writeback(req);
- nfs_release_request(req);
generic_error_remove_page(page_file_mapping(req->wb_page),
req->wb_page);
+ nfs_release_request(req);
+}
+
+static bool
+nfs_error_is_fatal_on_server(int err)
+{
+ switch (err) {
+ case 0:
+ case -ERESTARTSYS:
+ case -EINTR:
+ return false;
+ }
+ return nfs_error_is_fatal(err);
}
/*
@@ -557,8 +585,7 @@ static void nfs_write_error_remove_page(struct nfs_page *req)
* May return an error if the user signalled nfs_wait_on_request().
*/
static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page, bool nonblock,
- bool launder)
+ struct page *page, bool nonblock)
{
struct nfs_page *req;
int ret = 0;
@@ -574,19 +601,19 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
ret = 0;
+ /* If there is a fatal error that covers this write, just exit */
+ if (nfs_error_is_fatal_on_server(req->wb_context->error))
+ goto out_launder;
+
if (!nfs_pageio_add_request(pgio, req)) {
ret = pgio->pg_error;
/*
- * Remove the problematic req upon fatal errors
- * in launder case, while other dirty pages can
- * still be around until they get flushed.
+ * Remove the problematic req upon fatal errors on the server
*/
if (nfs_error_is_fatal(ret)) {
nfs_context_set_write_error(req->wb_context, ret);
- if (launder) {
- nfs_write_error_remove_page(req);
- goto out;
- }
+ if (nfs_error_is_fatal_on_server(ret))
+ goto out_launder;
}
nfs_redirty_request(req);
ret = -EAGAIN;
@@ -595,16 +622,18 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
NFSIOS_WRITEPAGES, 1);
out:
return ret;
+out_launder:
+ nfs_write_error_remove_page(req);
+ return ret;
}
static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
- struct nfs_pageio_descriptor *pgio, bool launder)
+ struct nfs_pageio_descriptor *pgio)
{
int ret;
nfs_pageio_cond_complete(pgio, page_index(page));
- ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
- launder);
+ ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
if (ret == -EAGAIN) {
redirty_page_for_writepage(wbc, page);
ret = 0;
@@ -616,8 +645,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
* Write an mmapped page to the server.
*/
static int nfs_writepage_locked(struct page *page,
- struct writeback_control *wbc,
- bool launder)
+ struct writeback_control *wbc)
{
struct nfs_pageio_descriptor pgio;
struct inode *inode = page_file_mapping(page)->host;
@@ -626,7 +654,7 @@ static int nfs_writepage_locked(struct page *page,
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, 0,
false, &nfs_async_write_completion_ops);
- err = nfs_do_writepage(page, wbc, &pgio, launder);
+ err = nfs_do_writepage(page, wbc, &pgio);
nfs_pageio_complete(&pgio);
if (err < 0)
return err;
@@ -639,7 +667,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
{
int ret;
- ret = nfs_writepage_locked(page, wbc, false);
+ ret = nfs_writepage_locked(page, wbc);
unlock_page(page);
return ret;
}
@@ -648,7 +676,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
{
int ret;
- ret = nfs_do_writepage(page, wbc, data, false);
+ ret = nfs_do_writepage(page, wbc, data);
unlock_page(page);
return ret;
}
@@ -1367,7 +1395,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
pg_ops = server->pnfs_curr_ld->pg_write_ops;
#endif
nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops,
- server->wsize, ioflags);
+ server->wsize, ioflags, GFP_NOIO);
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
@@ -1704,50 +1732,14 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
if (list_empty(head))
return 0;
- data = nfs_commitdata_alloc();
-
- if (!data)
- goto out_bad;
+ data = nfs_commitdata_alloc(true);
/* Set up the argument struct */
nfs_init_commit(data, head, NULL, cinfo);
atomic_inc(&cinfo->mds->rpcs_out);
return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
data->mds_ops, how, 0);
- out_bad:
- nfs_retry_commit(head, NULL, cinfo, 0);
- return -ENOMEM;
-}
-
-int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf)
-{
- struct inode *inode = file_inode(file);
- struct nfs_open_context *open;
- struct nfs_commit_info cinfo;
- struct nfs_page *req;
- int ret;
-
- open = get_nfs_open_context(nfs_file_open_context(file));
- req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode));
- if (IS_ERR(req)) {
- ret = PTR_ERR(req);
- goto out_put;
- }
-
- nfs_init_cinfo_from_inode(&cinfo, inode);
-
- memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier));
- nfs_request_add_commit_list(req, &cinfo);
- ret = nfs_commit_inode(inode, FLUSH_SYNC);
- if (ret > 0)
- ret = 0;
-
- nfs_free_request(req);
-out_put:
- put_nfs_open_context(open);
- return ret;
}
-EXPORT_SYMBOL_GPL(nfs_commit_file);
/*
* COMMIT call returned
@@ -1985,7 +1977,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
/*
* Write back all requests on one page - we do this before reading it.
*/
-int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
+int nfs_wb_page(struct inode *inode, struct page *page)
{
loff_t range_start = page_file_offset(page);
loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
@@ -2002,7 +1994,7 @@ int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
for (;;) {
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
- ret = nfs_writepage_locked(page, &wbc, launder);
+ ret = nfs_writepage_locked(page, &wbc);
if (ret < 0)
goto out_error;
continue;
@@ -2107,7 +2099,6 @@ void nfs_destroy_writepagecache(void)
}
static const struct nfs_rw_ops nfs_rw_write_ops = {
- .rw_mode = FMODE_WRITE,
.rw_alloc_header = nfs_writehdr_alloc,
.rw_free_header = nfs_writehdr_free,
.rw_done = nfs_writeback_done,
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index fb5213afc854..c862c2489df0 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -219,6 +219,9 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
u8 *buf, *d, type, assoc;
int error;
+ if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q)))
+ return -EINVAL;
+
buf = kzalloc(bufflen, GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -229,7 +232,6 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
goto out_free_buf;
}
req = scsi_req(rq);
- scsi_req_init(rq);
error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
if (error)
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index e71f11b1a180..3bc08c394a3f 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -486,7 +486,7 @@ secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; }
#endif
static inline int
-uuid_parse(char **mesg, char *buf, unsigned char **puuid)
+nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid)
{
int len;
@@ -586,7 +586,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
if (strcmp(buf, "fsloc") == 0)
err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
else if (strcmp(buf, "uuid") == 0)
- err = uuid_parse(&mesg, buf, &exp.ex_uuid);
+ err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid);
else if (strcmp(buf, "secinfo") == 0)
err = secinfo_parse(&mesg, buf, &exp);
else
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index d86031b6ad79..dadb3bf305b2 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1259,7 +1259,8 @@ nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
return NULL;
}
- if (!(exp->ex_layout_types & (1 << layout_type))) {
+ if (layout_type >= LAYOUT_TYPE_MAX ||
+ !(exp->ex_layout_types & (1 << layout_type))) {
dprintk("%s: layout type %d not supported\n",
__func__, layout_type);
return NULL;
@@ -1768,6 +1769,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
opdesc->op_get_currentstateid(cstate, &op->u);
op->status = opdesc->op_func(rqstp, cstate, &op->u);
+ /* Only from SEQUENCE */
+ if (cstate->status == nfserr_replay_cache) {
+ dprintk("%s NFS4.1 replay from cache\n", __func__);
+ status = op->status;
+ goto out;
+ }
if (!op->status) {
if (opdesc->op_set_currentstateid)
opdesc->op_set_currentstateid(cstate, &op->u);
@@ -1778,14 +1785,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
if (need_wrongsec_check(rqstp))
op->status = check_nfsd_access(current_fh->fh_export, rqstp);
}
-
encode_op:
- /* Only from SEQUENCE */
- if (cstate->status == nfserr_replay_cache) {
- dprintk("%s NFS4.1 replay from cache\n", __func__);
- status = op->status;
- goto out;
- }
if (op->status == nfserr_replay_me) {
op->replay = &cstate->replay_owner->so_replay;
nfsd4_encode_replay(&resp->xdr, op);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e9ef50addddb..22002fb75a18 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1912,28 +1912,15 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
target->cl_clientid.cl_id = source->cl_clientid.cl_id;
}
-int strdup_if_nonnull(char **target, char *source)
-{
- if (source) {
- *target = kstrdup(source, GFP_KERNEL);
- if (!*target)
- return -ENOMEM;
- } else
- *target = NULL;
- return 0;
-}
-
static int copy_cred(struct svc_cred *target, struct svc_cred *source)
{
- int ret;
+ target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL);
+ target->cr_raw_principal = kstrdup(source->cr_raw_principal,
+ GFP_KERNEL);
+ if ((source->cr_principal && !target->cr_principal) ||
+ (source->cr_raw_principal && !target->cr_raw_principal))
+ return -ENOMEM;
- ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal);
- if (ret)
- return ret;
- ret = strdup_if_nonnull(&target->cr_raw_principal,
- source->cr_raw_principal);
- if (ret)
- return ret;
target->cr_flavor = source->cr_flavor;
target->cr_uid = source->cr_uid;
target->cr_gid = source->cr_gid;
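This simplification leans on kstrdup() accepting a NULL source and returning NULL, so a NULL result only signals an allocation failure when the source string actually existed. A minimal sketch of the idiom (field name illustrative):

	/* kstrdup(NULL, gfp) returns NULL without allocating */
	dst->label = kstrdup(src->label, GFP_KERNEL);
	if (src->label && !dst->label)
		return -ENOMEM;	/* source existed, copy failed */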
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 33017d652b1d..26780d53a6f9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2831,9 +2831,14 @@ out_acl:
}
#endif /* CONFIG_NFSD_PNFS */
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
- status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0,
- NFSD_SUPPATTR_EXCLCREAT_WORD1,
- NFSD_SUPPATTR_EXCLCREAT_WORD2);
+ u32 supp[3];
+
+ memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
+ supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
+ supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
+ supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
+
+ status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]);
if (status)
goto out;
}
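The old code advertised the EXCLCREAT constants unconditionally; the fix reports only their intersection with what the negotiated minor version actually supports. Schematically (a sketch restating the hunk above):

	u32 supp[3];

	/* start from the minor version's supported-attr words ... */
	memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
	/* ... and advertise only the exclusive-create subset */
	supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
	supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
	supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;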
@@ -4119,8 +4124,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_getdeviceinfo *gdev)
{
struct xdr_stream *xdr = &resp->xdr;
- const struct nfsd4_layout_ops *ops =
- nfsd4_layout_ops[gdev->gd_layout_type];
+ const struct nfsd4_layout_ops *ops;
u32 starting_len = xdr->buf->len, needed_len;
__be32 *p;
@@ -4137,6 +4141,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
/* If maxcount is 0 then just update notifications */
if (gdev->gd_maxcount != 0) {
+ ops = nfsd4_layout_ops[gdev->gd_layout_type];
nfserr = ops->encode_getdeviceinfo(xdr, gdev);
if (nfserr) {
/*
@@ -4189,8 +4194,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_layoutget *lgp)
{
struct xdr_stream *xdr = &resp->xdr;
- const struct nfsd4_layout_ops *ops =
- nfsd4_layout_ops[lgp->lg_layout_type];
+ const struct nfsd4_layout_ops *ops;
__be32 *p;
dprintk("%s: err %d\n", __func__, nfserr);
@@ -4213,6 +4217,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
*p++ = cpu_to_be32(lgp->lg_seg.iomode);
*p++ = cpu_to_be32(lgp->lg_layout_type);
+ ops = nfsd4_layout_ops[lgp->lg_layout_type];
nfserr = ops->encode_layoutget(xdr, lgp);
out:
kfree(lgp->lg_content);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 8bf8f667a8cf..6493df6b1bd5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1146,7 +1146,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
{
- static struct tree_descr nfsd_files[] = {
+ static const struct tree_descr nfsd_files[] = {
[NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO},
[NFSD_Export_features] = {"export_features",
&export_features_operations, S_IRUGO},
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9aaf6ca77569..38d0383dc7f9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -94,6 +94,12 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
err = follow_down(&path);
if (err < 0)
goto out;
+ if (path.mnt == exp->ex_path.mnt && path.dentry == dentry &&
+ nfsd_mountpoint(dentry, exp) == 2) {
+ /* This is only a mountpoint in some other namespace */
+ path_put(&path);
+ goto out;
+ }
exp2 = rqst_exp_get_by_name(rqstp, &path);
if (IS_ERR(exp2)) {
@@ -167,16 +173,26 @@ static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, st
/*
* For nfsd purposes, we treat V4ROOT exports as though there was an
* export at *every* directory.
+ * We return:
+ * '1' if this dentry *must* be an export point,
+ * '2' if it might be one, if there is really a mount here, and
+ * '0' if there is no chance of an export point here.
*/
int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
{
- if (d_mountpoint(dentry))
+ if (!d_inode(dentry))
+ return 0;
+ if (exp->ex_flags & NFSEXP_V4ROOT)
return 1;
if (nfsd4_is_junction(dentry))
return 1;
- if (!(exp->ex_flags & NFSEXP_V4ROOT))
- return 0;
- return d_inode(dentry) != NULL;
+ if (d_mountpoint(dentry))
+ /*
+ * Might only be a mountpoint in a different namespace,
+ * but we need to check.
+ */
+ return 2;
+ return 0;
}
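Callers use the tri-state to defer the expensive check, as nfsd_cross_mnt does above: only a return of 2 requires follow_down() plus a comparison to confirm the mount is visible in this namespace. A hedged sketch of the intended dispatch (helper names hypothetical):

	switch (nfsd_mountpoint(dentry, exp)) {
	case 0:	/* no chance of an export point here */
		break;
	case 1:	/* V4ROOT or junction: always treat as an export point */
		cross_mnt(dentry);
		break;
	case 2:	/* d_mountpoint() is set, but possibly only in another
		 * namespace: follow_down() and compare before trusting it */
		cross_mnt_if_really_mounted(dentry);
		break;
	}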
__be32
@@ -895,24 +911,13 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp,
__be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
unsigned long *count)
{
- mm_segment_t oldfs;
+ struct iov_iter iter;
int host_err;
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0);
- set_fs(oldfs);
- return nfsd_finish_read(file, count, host_err);
-}
+ iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count);
+ host_err = vfs_iter_read(file, &iter, &offset, 0);
-static __be32
-nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file,
- loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
-{
- if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
- return nfsd_splice_read(rqstp, file, offset, count);
- else
- return nfsd_readv(file, offset, vec, vlen, count);
+ return nfsd_finish_read(file, count, host_err);
}
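The conversion replaces the set_fs(KERNEL_DS) trick with an iov_iter describing a kernel buffer. The general read pattern, sketched against the API of this series (vfs_iter_read() taking a flags argument; kbuf, buflen and pos are assumed locals):

	struct kvec vec = { .iov_base = kbuf, .iov_len = buflen };
	struct iov_iter iter;
	ssize_t ret;

	/* READ: data lands in the kvec; ITER_KVEC: kernel addresses */
	iov_iter_kvec(&iter, READ | ITER_KVEC, &vec, 1, buflen);
	ret = vfs_iter_read(file, &iter, &pos, 0);
	/* ret is bytes read or -errno; pos has been advanced */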
/*
@@ -958,7 +963,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
unsigned long *cnt, int stable)
{
struct svc_export *exp;
- mm_segment_t oldfs;
+ struct iov_iter iter;
__be32 err = 0;
int host_err;
int use_wgather;
@@ -984,10 +989,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
if (stable && !use_wgather)
flags |= RWF_SYNC;
- /* Write the data. */
- oldfs = get_fs(); set_fs(KERNEL_DS);
- host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, flags);
- set_fs(oldfs);
+ iov_iter_kvec(&iter, WRITE | ITER_KVEC, vec, vlen, *cnt);
+ host_err = vfs_iter_write(file, &iter, &pos, flags);
if (host_err < 0)
goto out_nfserr;
*cnt = host_err;
@@ -1028,7 +1031,12 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
ra = nfsd_init_raparms(file);
trace_read_opened(rqstp, fhp, offset, vlen);
- err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count);
+
+ if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
+ err = nfsd_splice_read(rqstp, file, offset, count);
+ else
+ err = nfsd_readv(file, offset, vec, vlen, count);
+
trace_read_io_done(rqstp, fhp, offset, vlen);
if (ra)
@@ -1448,41 +1456,34 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32
nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
{
- mm_segment_t oldfs;
__be32 err;
- int host_err;
+ const char *link;
struct path path;
+ DEFINE_DELAYED_CALL(done);
+ int len;
err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP);
- if (err)
- goto out;
+ if (unlikely(err))
+ return err;
path.mnt = fhp->fh_export->ex_path.mnt;
path.dentry = fhp->fh_dentry;
- err = nfserr_inval;
- if (!d_is_symlink(path.dentry))
- goto out;
+ if (unlikely(!d_is_symlink(path.dentry)))
+ return nfserr_inval;
touch_atime(&path);
- /* N.B. Why does this call need a get_fs()??
- * Remove the set_fs and watch the fireworks:-) --okir
- */
- oldfs = get_fs(); set_fs(KERNEL_DS);
- host_err = vfs_readlink(path.dentry, (char __user *)buf, *lenp);
- set_fs(oldfs);
+ link = vfs_get_link(path.dentry, &done);
+ if (IS_ERR(link))
+ return nfserrno(PTR_ERR(link));
- if (host_err < 0)
- goto out_nfserr;
- *lenp = host_err;
- err = 0;
-out:
- return err;
-
-out_nfserr:
- err = nfserrno(host_err);
- goto out;
+ len = strlen(link);
+ if (len < *lenp)
+ *lenp = len;
+ memcpy(buf, link, *lenp);
+ do_delayed_call(&done);
+ return 0;
}
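vfs_get_link() may return a string whose lifetime the filesystem manages, which is why the rewrite brackets its use with DEFINE_DELAYED_CALL()/do_delayed_call(). The core pattern, as a sketch:

	DEFINE_DELAYED_CALL(done);
	const char *link;

	link = vfs_get_link(dentry, &done);	/* fs may pin resources */
	if (IS_ERR(link))
		return PTR_ERR(link);
	/* ... copy out of 'link' while it is still pinned ... */
	do_delayed_call(&done);			/* release what was pinned */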
/*
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 6f87b2ac1aeb..e73c86d9855c 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -338,7 +338,7 @@ static void nilfs_end_bio_write(struct bio *bio)
{
struct nilfs_segment_buffer *segbuf = bio->bi_private;
- if (bio->bi_error)
+ if (bio->bi_status)
atomic_inc(&segbuf->sb_err);
bio_put(bio);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index febed1217b3f..70ded52dc1dd 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2161,7 +2161,7 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino)
}
struct nilfs_segctor_wait_request {
- wait_queue_t wq;
+ wait_queue_entry_t wq;
__u32 seq;
int err;
atomic_t done;
@@ -2206,8 +2206,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
unsigned long flags;
spin_lock_irqsave(&sci->sc_wait_request.lock, flags);
- list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list,
- wq.task_list) {
+ list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.head, wq.entry) {
if (!atomic_read(&wrq->done) &&
nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) {
wrq->err = err;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 01a9f0f007d4..0c4583b61717 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -161,16 +161,20 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask
if (unlikely(!fsnotify_inode_watches_children(p_inode)))
__fsnotify_update_child_dentry_flags(p_inode);
else if (p_inode->i_fsnotify_mask & mask) {
+ struct name_snapshot name;
+
/* we are notifying a parent so come up with the new mask which
* specifies these are events which came from a child. */
mask |= FS_EVENT_ON_CHILD;
+ take_dentry_name_snapshot(&name, dentry);
if (path)
ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
- dentry->d_name.name, 0);
+ name.name, 0);
else
ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
- dentry->d_name.name, 0);
+ name.name, 0);
+ release_dentry_name_snapshot(&name);
}
dput(parent);
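take_dentry_name_snapshot() pins a stable copy of d_name so a concurrent rename cannot free or mutate the string the event is built from; the bare dentry->d_name.name it replaces had exactly that race. Minimal usage sketch (the consumer is hypothetical):

	struct name_snapshot name;

	take_dentry_name_snapshot(&name, dentry);
	/* name.name is stable against concurrent d_move()/rename */
	report_event(name.name);
	release_dentry_name_snapshot(&name);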
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 323f492e0822..f3db56e83dd2 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -196,9 +196,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task,
{
struct ns_common *ns;
int res = -ENOENT;
+ const char *name;
ns = ns_ops->get(task);
if (ns) {
- res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum);
+ name = ns_ops->real_ns_name ? : ns_ops->name;
+ res = snprintf(buf, size, "%s:[%u]", name, ns->inum);
ns_ops->put(ns);
}
return res;
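The `? :` here is the GNU C conditional with omitted middle operand: `a ?: b` yields a when it is non-NULL, evaluating a only once. Long-hand equivalent:

	const char *name;

	/* same as: name = ns_ops->real_ns_name ? : ns_ops->name; */
	if (ns_ops->real_ns_name)
		name = ns_ops->real_ns_name;
	else
		name = ns_ops->name;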
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 358258364616..4690cd75d8d7 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -159,7 +159,7 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
PTR_ERR(dent_inode));
kfree(name);
/* Return the error code. */
- return (struct dentry *)dent_inode;
+ return ERR_CAST(dent_inode);
}
/* It is guaranteed that @name is no longer allocated at this point. */
if (MREF_ERR(mref) == -ENOENT) {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 0da0332725aa..ffe003982d95 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -516,9 +516,9 @@ static void o2hb_bio_end_io(struct bio *bio)
{
struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
- if (bio->bi_error) {
- mlog(ML_ERROR, "IO Error %d\n", bio->bi_error);
- wc->wc_error = bio->bi_error;
+ if (bio->bi_status) {
+ mlog(ML_ERROR, "IO Error %d\n", bio->bi_status);
+ wc->wc_error = blk_status_to_errno(bio->bi_status);
}
o2hb_bio_wait_dec(wc, 1);
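The bi_error integer is gone in favour of the dedicated blk_status_t in bi_status; completion handlers now test it and convert explicitly wherever an errno is stored. The updated endio idiom, sketched with a hypothetical context struct:

	static void example_end_io(struct bio *bio)
	{
		struct example_ctx *ctx = bio->bi_private; /* hypothetical */

		if (bio->bi_status)	/* blk_status_t, not an errno */
			ctx->error = blk_status_to_errno(bio->bi_status);
		complete(&ctx->done);
		bio_put(bio);
	}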
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 564c504d6efd..74a21f6695c8 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -426,6 +426,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
struct o2net_sock_container *dummy_sc = sd->dbg_sock;
o2net_debug_del_sc(dummy_sc);
+ kfree(dummy_sc);
return seq_release_private(inode, file);
}
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3b7c937a36b5..4689940a953c 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2591,6 +2591,10 @@ void ocfs2_inode_unlock_tracker(struct inode *inode,
struct ocfs2_lock_res *lockres;
lockres = &OCFS2_I(inode)->ip_inode_lockres;
+ /* had_lock means that the current process has already taken the
+ * cluster lock previously. If had_lock is 1, we have nothing to do
+ * here, and it will get unlocked where we took the lock.
+ */
if (!had_lock) {
ocfs2_remove_holder(lockres, oh);
ocfs2_inode_unlock(inode, ex);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 827fc9809bc2..9f88188060db 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -119,7 +119,7 @@ check_err:
if (IS_ERR(inode)) {
mlog_errno(PTR_ERR(inode));
- result = (void *)inode;
+ result = ERR_CAST(inode);
goto bail;
}
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 382401d3e88f..1a1e0078ab38 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -136,7 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
int sysfile_type)
{
- int rc = 0;
+ int rc = -ESTALE;
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 44d178b8d1aa..5bb4a89f9045 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -25,6 +25,8 @@
#ifndef _OCFS2_FS_H
#define _OCFS2_FS_H
+#include <linux/magic.h>
+
/* Version */
#define OCFS2_MAJOR_REV_LEVEL 0
#define OCFS2_MINOR_REV_LEVEL 90
@@ -56,9 +58,6 @@
#define OCFS2_MIN_BLOCKSIZE 512
#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE
-/* Filesystem magic number */
-#define OCFS2_SUPER_MAGIC 0x7461636f
-
/* Object signatures */
#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2"
#define OCFS2_INODE_SIGNATURE "INODE01"
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 820359096c7a..d6c350ba25b9 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -631,7 +631,7 @@ static struct attribute *ocfs2_attrs[] = {
NULL,
};
-static struct attribute_group ocfs2_attr_group = {
+static const struct attribute_group ocfs2_attr_group = {
.attrs = ocfs2_attrs,
};
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ca1646fbcaef..83005f486451 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2062,7 +2062,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
- memcpy(sb->s_uuid, di->id2.i_super.s_uuid,
+ memcpy(&sb->s_uuid, di->id2.i_super.s_uuid,
sizeof(di->id2.i_super.s_uuid));
osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3c5384d9b3a5..f70c3778d600 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1328,20 +1328,21 @@ static int ocfs2_xattr_get(struct inode *inode,
void *buffer,
size_t buffer_size)
{
- int ret;
+ int ret, had_lock;
struct buffer_head *di_bh = NULL;
+ struct ocfs2_lock_holder oh;
- ret = ocfs2_inode_lock(inode, &di_bh, 0);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
+ had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh);
+ if (had_lock < 0) {
+ mlog_errno(had_lock);
+ return had_lock;
}
down_read(&OCFS2_I(inode)->ip_xattr_sem);
ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
name, buffer, buffer_size);
up_read(&OCFS2_I(inode)->ip_xattr_sem);
- ocfs2_inode_unlock(inode, 0);
+ ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
brelse(di_bh);
@@ -3537,11 +3538,12 @@ int ocfs2_xattr_set(struct inode *inode,
{
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
- int ret, credits, ref_meta = 0, ref_credits = 0;
+ int ret, credits, had_lock, ref_meta = 0, ref_credits = 0;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *tl_inode = osb->osb_tl_inode;
struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, };
struct ocfs2_refcount_tree *ref_tree = NULL;
+ struct ocfs2_lock_holder oh;
struct ocfs2_xattr_info xi = {
.xi_name_index = name_index,
@@ -3572,8 +3574,9 @@ int ocfs2_xattr_set(struct inode *inode,
return -ENOMEM;
}
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
+ had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 1, &oh);
+ if (had_lock < 0) {
+ ret = had_lock;
mlog_errno(ret);
goto cleanup_nolock;
}
@@ -3670,7 +3673,7 @@ cleanup:
if (ret)
mlog_errno(ret);
}
- ocfs2_inode_unlock(inode, 1);
+ ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
cleanup_nolock:
brelse(di_bh);
brelse(xbs.xattr_bh);
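The tracker variants avoid a cluster-lock self-deadlock when a process re-enters a path that takes a lock it already holds: had_lock records whether this call actually acquired it, and unlock only drops it in that case. The pattern used in both hunks above:

	struct ocfs2_lock_holder oh;
	int had_lock;

	had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, ex, &oh);
	if (had_lock < 0)
		return had_lock;		/* real error */
	/* had_lock == 1: this process already held the lock */

	/* ... critical section ... */

	/* drops the lock only if this call took it (had_lock == 0) */
	ocfs2_inode_unlock_tracker(inode, ex, &oh, had_lock);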
diff --git a/fs/open.c b/fs/open.c
index 4d23f729dcc6..35bb784763a4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -193,7 +193,8 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
goto out_putf;
error = -EPERM;
- if (IS_APPEND(inode))
+ /* Check IS_APPEND on real upper inode */
+ if (IS_APPEND(file_inode(f.file)))
goto out_putf;
sb_start_write(inode->i_sb);
@@ -459,20 +460,17 @@ out:
SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
struct fd f = fdget_raw(fd);
- struct inode *inode;
- int error = -EBADF;
+ int error;
error = -EBADF;
if (!f.file)
goto out;
- inode = file_inode(f.file);
-
error = -ENOTDIR;
- if (!S_ISDIR(inode->i_mode))
+ if (!d_can_lookup(f.file->f_path.dentry))
goto out_putf;
- error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
+ error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, &f.file->f_path);
out_putf:
@@ -709,6 +707,9 @@ static int do_dentry_open(struct file *f,
f->f_inode = inode;
f->f_mapping = inode->i_mapping;
+ /* Ensure that we skip any errors that predate opening of the file */
+ f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
+
if (unlikely(f->f_flags & O_PATH)) {
f->f_mode = FMODE_PATH;
f->f_op = &empty_fops;
@@ -761,6 +762,7 @@ static int do_dentry_open(struct file *f,
likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
+ f->f_write_hint = WRITE_LIFE_NOT_SET;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
@@ -900,6 +902,12 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
int lookup_flags = 0;
int acc_mode = ACC_MODE(flags);
+ /*
+ * Clear out all open flags we don't know about so that we don't report
+ * them in fcntl(F_GETFD) or similar interfaces.
+ */
+ flags &= VALID_OPEN_FLAGS;
+
if (flags & (O_CREAT | __O_TMPFILE))
op->mode = (mode & S_IALLUGO) | S_IFREG;
else
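The f_wb_err sampling in the do_dentry_open() hunk stores the mapping's writeback-error cursor at open time, so a later fsync on this descriptor reports only errors that postdate the open. A sketch using the errseq_t helpers from the same series (assumed available here):

	errseq_t since;
	int err;

	/* at open: remember the current error cursor */
	since = filemap_sample_wb_err(file->f_mapping);

	/* ... writes and writeback happen ... */

	/* at fsync: see only errors newer than 'since' */
	err = filemap_check_wb_err(file->f_mapping, since);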
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 83b506020718..038d67545d9f 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -46,8 +46,8 @@ static void run_down(struct slot_map *m)
spin_lock(&m->q.lock);
if (m->c != -1) {
for (;;) {
- if (likely(list_empty(&wait.task_list)))
- __add_wait_queue_tail(&m->q, &wait);
+ if (likely(list_empty(&wait.entry)))
+ __add_wait_queue_entry_tail(&m->q, &wait);
set_current_state(TASK_UNINTERRUPTIBLE);
if (m->c == -1)
@@ -84,8 +84,8 @@ static int wait_for_free(struct slot_map *m)
do {
long n = left, t;
- if (likely(list_empty(&wait.task_list)))
- __add_wait_queue_tail_exclusive(&m->q, &wait);
+ if (likely(list_empty(&wait.entry)))
+ __add_wait_queue_entry_tail_exclusive(&m->q, &wait);
set_current_state(TASK_INTERRUPTIBLE);
if (m->c > 0)
@@ -108,8 +108,8 @@ static int wait_for_free(struct slot_map *m)
left = -EINTR;
} while (left > 0);
- if (!list_empty(&wait.task_list))
- list_del(&wait.task_list);
+ if (!list_empty(&wait.entry))
+ list_del(&wait.entry);
else if (left <= 0 && waitqueue_active(&m->q))
__wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL);
__set_current_state(TASK_RUNNING);
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index 0daac5112f7a..c0c9683934b7 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -1,5 +1,6 @@
config OVERLAY_FS
tristate "Overlay filesystem support"
+ select EXPORTFS
help
An overlay filesystem combines two filesystems - an 'upper' filesystem
and a 'lower' filesystem. When a name exists in both filesystems, the
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 906ea6c93260..e5869f91b3ab 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -20,6 +20,7 @@
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
+#include <linux/exportfs.h>
#include "overlayfs.h"
#include "ovl_entry.h"
@@ -232,6 +233,82 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
return err;
}
+static struct ovl_fh *ovl_encode_fh(struct dentry *lower, uuid_t *uuid)
+{
+ struct ovl_fh *fh;
+ int fh_type, fh_len, dwords;
+ void *buf;
+ int buflen = MAX_HANDLE_SZ;
+
+ buf = kmalloc(buflen, GFP_TEMPORARY);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * We encode a non-connectable file handle for non-dir, because we
+ * only need to find the lower inode number and we don't want to pay
+ * the price of reconnecting the dentry.
+ */
+ dwords = buflen >> 2;
+ fh_type = exportfs_encode_fh(lower, buf, &dwords, 0);
+ buflen = (dwords << 2);
+
+ fh = ERR_PTR(-EIO);
+ if (WARN_ON(fh_type < 0) ||
+ WARN_ON(buflen > MAX_HANDLE_SZ) ||
+ WARN_ON(fh_type == FILEID_INVALID))
+ goto out;
+
+ BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255);
+ fh_len = offsetof(struct ovl_fh, fid) + buflen;
+ fh = kmalloc(fh_len, GFP_KERNEL);
+ if (!fh) {
+ fh = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ fh->version = OVL_FH_VERSION;
+ fh->magic = OVL_FH_MAGIC;
+ fh->type = fh_type;
+ fh->flags = OVL_FH_FLAG_CPU_ENDIAN;
+ fh->len = fh_len;
+ fh->uuid = *uuid;
+ memcpy(fh->fid, buf, buflen);
+
+out:
+ kfree(buf);
+ return fh;
+}
+
+static int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
+ struct dentry *upper)
+{
+ struct super_block *sb = lower->d_sb;
+ const struct ovl_fh *fh = NULL;
+ int err;
+
+ /*
+ * When the lower layer doesn't support export operations, store a 'null'
+ * fh, so we can use the overlay.origin xattr to distinguish between a
+ * copy up and a pure upper inode.
+ */
+ if (sb->s_export_op && sb->s_export_op->fh_to_dentry &&
+ !uuid_is_null(&sb->s_uuid)) {
+ fh = ovl_encode_fh(lower, &sb->s_uuid);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+ }
+
+ /*
+ * Do not fail when upper doesn't support xattrs.
+ */
+ err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh,
+ fh ? fh->len : 0, 0);
+ kfree(fh);
+
+ return err;
+}
+
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, const char *link,
@@ -252,15 +329,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
.link = link
};
- upper = lookup_one_len(dentry->d_name.name, upperdir,
- dentry->d_name.len);
- err = PTR_ERR(upper);
- if (IS_ERR(upper))
- goto out;
-
err = security_inode_copy_up(dentry, &new_creds);
if (err < 0)
- goto out1;
+ goto out;
if (new_creds)
old_creds = override_creds(new_creds);
@@ -268,13 +339,14 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
if (tmpfile)
temp = ovl_do_tmpfile(upperdir, stat->mode);
else
- temp = ovl_lookup_temp(workdir, dentry);
- err = PTR_ERR(temp);
- if (IS_ERR(temp))
- goto out1;
-
+ temp = ovl_lookup_temp(workdir);
err = 0;
- if (!tmpfile)
+ if (IS_ERR(temp)) {
+ err = PTR_ERR(temp);
+ temp = NULL;
+ }
+
+ if (!err && !tmpfile)
err = ovl_create_real(wdir, temp, &cattr, NULL, true);
if (new_creds) {
@@ -283,7 +355,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
}
if (err)
- goto out2;
+ goto out;
if (S_ISREG(stat->mode)) {
struct path upperpath;
@@ -316,6 +388,27 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
if (err)
goto out_cleanup;
+ /*
+ * Store identifier of lower inode in upper inode xattr to
+ * allow lookup of the copy up origin inode.
+ *
+ * Don't set origin when we are breaking the association with a lower
+ * hard link.
+ */
+ if (S_ISDIR(stat->mode) || stat->nlink == 1) {
+ err = ovl_set_origin(dentry, lowerpath->dentry, temp);
+ if (err)
+ goto out_cleanup;
+ }
+
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ if (IS_ERR(upper)) {
+ err = PTR_ERR(upper);
+ upper = NULL;
+ goto out_cleanup;
+ }
+
if (tmpfile)
err = ovl_do_link(temp, udir, upper, true);
else
@@ -329,17 +422,15 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, pstat);
-out2:
+out:
dput(temp);
-out1:
dput(upper);
-out:
return err;
out_cleanup:
if (!tmpfile)
ovl_cleanup(wdir, temp);
- goto out2;
+ goto out;
}
/*
@@ -372,6 +463,11 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
+ /* Mark parent "impure" because it may now contain non-pure upper */
+ err = ovl_set_impure(parent, upperdir);
+ if (err)
+ return err;
+
err = vfs_getattr(&parentpath, &pstat,
STATX_ATIME | STATX_MTIME, AT_STATX_SYNC_AS_STAT);
if (err)
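After these changes an upper inode created by copy up carries the encoded lower file handle in trusted.overlay.origin. Purely as an illustration (not part of the patch), a privileged tool could inspect the raw blob on the upper layer; trusted.* xattrs require CAP_SYS_ADMIN and the path is hypothetical:

	#include <stdio.h>
	#include <sys/xattr.h>

	int main(void)
	{
		unsigned char buf[255];
		ssize_t n = getxattr("/upper/foo", "trusted.overlay.origin",
				     buf, sizeof(buf));

		if (n > 0)	/* layout per struct ovl_fh below */
			printf("origin: version=%u magic=%#x len=%u\n",
			       buf[0], buf[1], buf[2]);
		else if (n == 0)
			printf("copied up, origin unknown (null fh)\n");
		return 0;
	}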
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 6515796460df..a63a71656e9b 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -41,7 +41,7 @@ void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
}
}
-struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
+struct dentry *ovl_lookup_temp(struct dentry *workdir)
{
struct dentry *temp;
char name[20];
@@ -68,7 +68,7 @@ static struct dentry *ovl_whiteout(struct dentry *workdir,
struct dentry *whiteout;
struct inode *wdir = workdir->d_inode;
- whiteout = ovl_lookup_temp(workdir, dentry);
+ whiteout = ovl_lookup_temp(workdir);
if (IS_ERR(whiteout))
return whiteout;
@@ -127,45 +127,26 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry,
return err;
}
-static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
+static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper,
+ int xerr)
{
int err;
- err = ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
+ err = ovl_check_setxattr(dentry, upper, OVL_XATTR_OPAQUE, "y", 1, xerr);
if (!err)
ovl_dentry_set_opaque(dentry);
return err;
}
-static int ovl_dir_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
+static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
{
- struct dentry *dentry = path->dentry;
- int err;
- enum ovl_path_type type;
- struct path realpath;
- const struct cred *old_cred;
-
- type = ovl_path_real(dentry, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getattr(&realpath, stat, request_mask, flags);
- revert_creds(old_cred);
- if (err)
- return err;
-
- stat->dev = dentry->d_sb->s_dev;
- stat->ino = dentry->d_inode->i_ino;
-
/*
- * It's probably not worth it to count subdirs to get the
- * correct link count. nlink=1 seems to pacify 'find' and
- * other utilities.
+ * Fail with -EIO when trying to create an opaque dir and the upper
+ * doesn't support xattrs. ovl_rename() calls ovl_set_opaque_xerr(-EXDEV)
+ * to return a specific error for the noxattr case.
*/
- if (OVL_TYPE_MERGE(type))
- stat->nlink = 1;
-
- return 0;
+ return ovl_set_opaque_xerr(dentry, upperdentry, -EIO);
}
/* Common operations required to be done after creation of file on upper */
@@ -182,6 +163,9 @@ static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
inc_nlink(inode);
}
d_instantiate(dentry, inode);
+ /* Force lookup of new upper hardlink to find its lower */
+ if (hardlink)
+ d_drop(dentry);
}
static bool ovl_type_merge(struct dentry *dentry)
@@ -189,6 +173,11 @@ static bool ovl_type_merge(struct dentry *dentry)
return OVL_TYPE_MERGE(ovl_path_type(dentry));
}
+static bool ovl_type_origin(struct dentry *dentry)
+{
+ return OVL_TYPE_ORIGIN(ovl_path_type(dentry));
+}
+
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct cattr *attr, struct dentry *hardlink)
{
@@ -210,7 +199,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
if (err)
goto out_dput;
- if (ovl_type_merge(dentry->d_parent)) {
+ if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) {
/* Setting opaque here is just an optimization, allow to fail */
ovl_set_opaque(dentry, newdentry);
}
@@ -277,7 +266,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (upper->d_parent->d_inode != udir)
goto out_unlock;
- opaquedir = ovl_lookup_temp(workdir, dentry);
+ opaquedir = ovl_lookup_temp(workdir);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
@@ -409,7 +398,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out;
- newdentry = ovl_lookup_temp(workdir, dentry);
+ newdentry = ovl_lookup_temp(workdir);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
@@ -873,18 +862,16 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir)
if (IS_ERR(redirect))
return PTR_ERR(redirect);
- err = ovl_do_setxattr(ovl_dentry_upper(dentry), OVL_XATTR_REDIRECT,
- redirect, strlen(redirect), 0);
+ err = ovl_check_setxattr(dentry, ovl_dentry_upper(dentry),
+ OVL_XATTR_REDIRECT,
+ redirect, strlen(redirect), -EXDEV);
if (!err) {
spin_lock(&dentry->d_lock);
ovl_dentry_set_redirect(dentry, redirect);
spin_unlock(&dentry->d_lock);
} else {
kfree(redirect);
- if (err == -EOPNOTSUPP)
- ovl_clear_redirect_dir(dentry->d_sb);
- else
- pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err);
+ pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err);
/* Fall back to userspace copy-up */
err = -EXDEV;
}
@@ -970,6 +957,25 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
+ if (!samedir) {
+ /*
+ * When moving a merge dir or non-dir with copy up origin into
+ * a new parent, we are marking the new parent dir "impure".
+ * When ovl_iterate() iterates an "impure" upper dir, it will
+ * look up the origin inodes of the entries to fill d_ino.
+ */
+ if (ovl_type_origin(old)) {
+ err = ovl_set_impure(new->d_parent, new_upperdir);
+ if (err)
+ goto out_revert_creds;
+ }
+ if (!overwrite && ovl_type_origin(new)) {
+ err = ovl_set_impure(old->d_parent, old_upperdir);
+ if (err)
+ goto out_revert_creds;
+ }
+ }
+
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
@@ -1019,7 +1025,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
if (ovl_type_merge_or_lower(old))
err = ovl_set_redirect(old, samedir);
else if (!old_opaque && ovl_type_merge(new->d_parent))
- err = ovl_set_opaque(old, olddentry);
+ err = ovl_set_opaque_xerr(old, olddentry, -EXDEV);
if (err)
goto out_dput;
}
@@ -1027,7 +1033,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
if (ovl_type_merge_or_lower(new))
err = ovl_set_redirect(new, samedir);
else if (!new_opaque && ovl_type_merge(old->d_parent))
- err = ovl_set_opaque(new, newdentry);
+ err = ovl_set_opaque_xerr(new, newdentry, -EXDEV);
if (err)
goto out_dput;
}
@@ -1070,7 +1076,7 @@ const struct inode_operations ovl_dir_inode_operations = {
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
- .getattr = ovl_dir_getattr,
+ .getattr = ovl_getattr,
.listxattr = ovl_listxattr,
.get_acl = ovl_get_acl,
.update_time = ovl_update_time,
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index f8fe6bf2036d..d613e2c41242 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -57,18 +57,78 @@ out:
return err;
}
-static int ovl_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
+int ovl_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
{
struct dentry *dentry = path->dentry;
+ enum ovl_path_type type;
struct path realpath;
const struct cred *old_cred;
+ bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
int err;
- ovl_path_real(dentry, &realpath);
+ type = ovl_path_real(dentry, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_getattr(&realpath, stat, request_mask, flags);
+ if (err)
+ goto out;
+
+ /*
+ * When all layers are on the same fs, all real inode numbers are
+ * unique, so we use the overlay st_dev, which is friendly to du -x.
+ *
+ * We also use st_ino of the copy up origin, if we know it.
+ * This guarantees constant st_dev/st_ino across copy up.
+ *
+ * If the filesystem supports NFS export ops, this also guarantees
+ * persistent st_ino across mount cycles.
+ */
+ if (ovl_same_sb(dentry->d_sb)) {
+ if (OVL_TYPE_ORIGIN(type)) {
+ struct kstat lowerstat;
+ u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0);
+
+ ovl_path_lower(dentry, &realpath);
+ err = vfs_getattr(&realpath, &lowerstat,
+ lowermask, flags);
+ if (err)
+ goto out;
+
+ WARN_ON_ONCE(stat->dev != lowerstat.dev);
+ /*
+ * Lower hardlinks are broken on copy up to different
+ * upper files, so we cannot use the lower origin st_ino
+ * for those different files, even for the same fs case.
+ */
+ if (is_dir || lowerstat.nlink == 1)
+ stat->ino = lowerstat.ino;
+ }
+ stat->dev = dentry->d_sb->s_dev;
+ } else if (is_dir) {
+ /*
+ * If not all layers are on the same fs, the pair {real st_ino;
+ * overlay st_dev} is not unique, so use the non-persistent
+ * overlay st_ino.
+ *
+ * Always use the overlay st_dev for directories, so 'find
+ * -xdev' will scan the entire overlay mount and won't cross the
+ * overlay mount boundaries.
+ */
+ stat->dev = dentry->d_sb->s_dev;
+ stat->ino = dentry->d_inode->i_ino;
+ }
+
+ /*
+ * It's probably not worth it to count subdirs to get the
+ * correct link count. nlink=1 seems to pacify 'find' and
+ * other utilities.
+ */
+ if (is_dir && OVL_TYPE_MERGE(type))
+ stat->nlink = 1;
+
+out:
revert_creds(old_cred);
+
return err;
}
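Condensed, the policy the new code implements can be restated as a small decision helper (a sketch, not kernel code; the parameters stand in for state computed above):

	static void ovl_stat_policy(struct kstat *stat, bool same_sb,
				    bool is_dir, bool has_origin,
				    bool lower_hardlink, dev_t ovl_dev,
				    u64 ovl_ino, u64 origin_ino)
	{
		if (same_sb) {
			/* constant st_ino across copy up, du -x friendly */
			if (has_origin && (is_dir || !lower_hardlink))
				stat->ino = origin_ino;
			stat->dev = ovl_dev;
		} else if (is_dir) {
			/* keep 'find -xdev' inside the overlay mount */
			stat->dev = ovl_dev;
			stat->ino = ovl_ino;	/* non-persistent */
		}
		/* non-dir on mixed fs: keep the real inode's dev/ino */
	}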
@@ -180,6 +240,16 @@ int ovl_xattr_get(struct dentry *dentry, const char *name,
return res;
}
+static bool ovl_can_list(const char *s)
+{
+ /* List all non-trusted xattrs */
+ if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
+ return true;
+
+ /* Never list trusted.overlay, list other trusted for superuser only */
+ return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN);
+}
+
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct dentry *realdentry = ovl_dentry_real(dentry);
@@ -203,7 +273,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
return -EIO;
len -= slen;
- if (ovl_is_private_xattr(s)) {
+ if (!ovl_can_list(s)) {
res -= slen;
memmove(s, s + slen, len);
} else {
@@ -303,6 +373,41 @@ static const struct inode_operations ovl_symlink_inode_operations = {
.update_time = ovl_update_time,
};
+/*
+ * It is possible to stack an overlayfs instance on top of another
+ * overlayfs instance as a lower layer. We need to annotate the
+ * stackable i_mutex locks according to the stack level of the super
+ * block instance. An overlayfs instance can never be at stack
+ * depth 0 (there is always a real fs below it). An overlayfs
+ * inode lock will use the lockdep annotation ovl_i_mutex_key[depth].
+ *
+ * For example, here is a snip from /proc/lockdep_chains after
+ * dir_iterate of nested overlayfs:
+ *
+ * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2)
+ * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
+ * [...] &type->i_mutex_dir_key (stack_depth=0)
+ */
+#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH
+
+static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
+{
+#ifdef CONFIG_LOCKDEP
+ static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
+ static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
+
+ int depth = inode->i_sb->s_stack_depth - 1;
+
+ if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING))
+ depth = 0;
+
+ if (S_ISDIR(inode->i_mode))
+ lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
+ else
+ lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);
+#endif
+}
+
static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_ino = get_next_ino();
@@ -312,6 +417,8 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
#endif
+ ovl_lockdep_annotate_inode_mutex_key(inode);
+
switch (mode & S_IFMT) {
case S_IFREG:
inode->i_op = &ovl_file_inode_operations;
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index b8b077821fb0..de0d4f742f36 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -12,6 +12,8 @@
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/ratelimit.h>
+#include <linux/mount.h>
+#include <linux/exportfs.h>
#include "overlayfs.h"
#include "ovl_entry.h"
@@ -81,19 +83,93 @@ invalid:
goto err_free;
}
-static bool ovl_is_opaquedir(struct dentry *dentry)
+static int ovl_acceptable(void *ctx, struct dentry *dentry)
+{
+ return 1;
+}
+
+static struct dentry *ovl_get_origin(struct dentry *dentry,
+ struct vfsmount *mnt)
{
int res;
- char val;
+ struct ovl_fh *fh = NULL;
+ struct dentry *origin = NULL;
+ int bytes;
- if (!d_is_dir(dentry))
- return false;
+ res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0);
+ if (res < 0) {
+ if (res == -ENODATA || res == -EOPNOTSUPP)
+ return NULL;
+ goto fail;
+ }
+ /* Zero size value means "copied up but origin unknown" */
+ if (res == 0)
+ return NULL;
- res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1);
- if (res == 1 && val == 'y')
- return true;
+ fh = kzalloc(res, GFP_TEMPORARY);
+ if (!fh)
+ return ERR_PTR(-ENOMEM);
+
+ res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, fh, res);
+ if (res < 0)
+ goto fail;
+
+ if (res < sizeof(struct ovl_fh) || res < fh->len)
+ goto invalid;
+
+ if (fh->magic != OVL_FH_MAGIC)
+ goto invalid;
+
+ /* Treat larger version and unknown flags as "origin unknown" */
+ if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL)
+ goto out;
+
+ /* Treat endianness mismatch as "origin unknown" */
+ if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) &&
+ (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN)
+ goto out;
+
+ bytes = (fh->len - offsetof(struct ovl_fh, fid));
+
+ /*
+ * Make sure that the stored uuid matches the uuid of the lower
+ * layer where file handle will be decoded.
+ */
+ if (!uuid_equal(&fh->uuid, &mnt->mnt_sb->s_uuid))
+ goto out;
- return false;
+ origin = exportfs_decode_fh(mnt, (struct fid *)fh->fid,
+ bytes >> 2, (int)fh->type,
+ ovl_acceptable, NULL);
+ if (IS_ERR(origin)) {
+ /* Treat stale file handle as "origin unknown" */
+ if (origin == ERR_PTR(-ESTALE))
+ origin = NULL;
+ goto out;
+ }
+
+ if (ovl_dentry_weird(origin) ||
+ ((d_inode(origin)->i_mode ^ d_inode(dentry)->i_mode) & S_IFMT)) {
+ dput(origin);
+ origin = NULL;
+ goto invalid;
+ }
+
+out:
+ kfree(fh);
+ return origin;
+
+fail:
+ pr_warn_ratelimited("overlayfs: failed to get origin (%i)\n", res);
+ goto out;
+invalid:
+ pr_warn_ratelimited("overlayfs: invalid origin (%*phN)\n", res, fh);
+ goto out;
+}
+
+static bool ovl_is_opaquedir(struct dentry *dentry)
+{
+ return ovl_check_dir_xattr(dentry, OVL_XATTR_OPAQUE);
}
static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
@@ -192,6 +268,45 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
return 0;
}
+
+static int ovl_check_origin(struct dentry *dentry, struct dentry *upperdentry,
+ struct path **stackp, unsigned int *ctrp)
+{
+ struct super_block *same_sb = ovl_same_sb(dentry->d_sb);
+ struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata;
+ struct vfsmount *mnt;
+ struct dentry *origin;
+
+ if (!same_sb || !roe->numlower)
+ return 0;
+
+ /*
+ * Since all layers are on the same fs, we use the first layer for
+ * decoding the file handle. We may get a disconnected dentry,
+ * which is fine, because we only need to hold the origin inode in
+ * cache and use its inode number. We may even get a connected dentry,
+ * that is not under the first layer's root. That is also fine for
+ * using it's inode number - it's the same as if we held a reference
+ * to a dentry in first layer that was moved under us.
+ */
+ mnt = roe->lowerstack[0].mnt;
+
+ origin = ovl_get_origin(upperdentry, mnt);
+ if (IS_ERR_OR_NULL(origin))
+ return PTR_ERR(origin);
+
+ BUG_ON(*stackp || *ctrp);
+ *stackp = kmalloc(sizeof(struct path), GFP_TEMPORARY);
+ if (!*stackp) {
+ dput(origin);
+ return -ENOMEM;
+ }
+ **stackp = (struct path) { .dentry = origin, .mnt = mnt };
+ *ctrp = 1;
+
+ return 0;
+}
+
/*
* Returns next layer in stack starting from top.
* Returns -1 if this is the last layer.
@@ -220,11 +335,13 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
const struct cred *old_cred;
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
struct ovl_entry *poe = dentry->d_parent->d_fsdata;
+ struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata;
struct path *stack = NULL;
struct dentry *upperdir, *upperdentry = NULL;
unsigned int ctr = 0;
struct inode *inode = NULL;
bool upperopaque = false;
+ bool upperimpure = false;
char *upperredirect = NULL;
struct dentry *this;
unsigned int i;
@@ -253,15 +370,24 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
err = -EREMOTE;
goto out;
}
+ if (upperdentry && !d.is_dir) {
+ BUG_ON(!d.stop || d.redirect);
+ err = ovl_check_origin(dentry, upperdentry,
+ &stack, &ctr);
+ if (err)
+ goto out;
+ }
if (d.redirect) {
upperredirect = kstrdup(d.redirect, GFP_KERNEL);
if (!upperredirect)
goto out_put_upper;
if (d.redirect[0] == '/')
- poe = dentry->d_sb->s_root->d_fsdata;
+ poe = roe;
}
upperopaque = d.opaque;
+ if (upperdentry && d.is_dir)
+ upperimpure = ovl_is_impuredir(upperdentry);
}
if (!d.stop && poe->numlower) {
@@ -290,10 +416,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (d.stop)
break;
- if (d.redirect &&
- d.redirect[0] == '/' &&
- poe != dentry->d_sb->s_root->d_fsdata) {
- poe = dentry->d_sb->s_root->d_fsdata;
+ if (d.redirect && d.redirect[0] == '/' && poe != roe) {
+ poe = roe;
/* Find the current layer on the root dentry */
for (i = 0; i < poe->numlower; i++)
@@ -332,6 +456,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
revert_creds(old_cred);
oe->opaque = upperopaque;
+ oe->impure = upperimpure;
oe->redirect = upperredirect;
oe->__upperdentry = upperdentry;
memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 741dc0b6931f..10863b4105fa 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -8,18 +8,57 @@
*/
#include <linux/kernel.h>
+#include <linux/uuid.h>
enum ovl_path_type {
__OVL_PATH_UPPER = (1 << 0),
__OVL_PATH_MERGE = (1 << 1),
+ __OVL_PATH_ORIGIN = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
+#define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN)
#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque"
#define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect"
+#define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin"
+#define OVL_XATTR_IMPURE OVL_XATTR_PREFIX "impure"
+
+/*
+ * The tuple (fh,uuid) is a universally unique identifier for a copy up origin,
+ * where:
+ * origin.fh - exported file handle of the lower file
+ * origin.uuid - uuid of the lower filesystem
+ */
+#define OVL_FH_VERSION 0
+#define OVL_FH_MAGIC 0xfb
+
+/* CPU byte order required for fid decoding: */
+#define OVL_FH_FLAG_BIG_ENDIAN (1 << 0)
+#define OVL_FH_FLAG_ANY_ENDIAN (1 << 1)
+
+#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN)
+
+#if defined(__LITTLE_ENDIAN)
+#define OVL_FH_FLAG_CPU_ENDIAN 0
+#elif defined(__BIG_ENDIAN)
+#define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN
+#else
+#error Endianness not defined
+#endif
+
+/* On-disk and in-memory format for redirect by file handle */
+struct ovl_fh {
+ u8 version; /* 0 */
+ u8 magic; /* 0xfb */
+ u8 len; /* size of this header + size of fid */
+ u8 flags; /* OVL_FH_FLAG_* */
+ u8 type; /* fid_type of fid */
+ uuid_t uuid; /* uuid of filesystem */
+ u8 fid[0]; /* file identifier */
+} __packed;
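A decoder must treat this blob defensively: a bad magic or short buffer is corruption, while a newer version, unknown flags or a foreign byte order merely mean "origin unknown". A validation sketch following the rules ovl_get_origin() applies in the namei.c hunk above:

	/* <0: invalid, 0: treat as origin unknown, 1: decodable */
	static int ovl_fh_classify(const struct ovl_fh *fh, int res)
	{
		if (res < (int)sizeof(struct ovl_fh) || res < fh->len)
			return -EINVAL;		/* truncated */
		if (fh->magic != OVL_FH_MAGIC)
			return -EINVAL;		/* corruption */
		if (fh->version > OVL_FH_VERSION ||
		    (fh->flags & ~OVL_FH_FLAG_ALL))
			return 0;		/* newer format */
		if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) &&
		    (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) !=
		    OVL_FH_FLAG_CPU_ENDIAN)
			return 0;		/* foreign endianness */
		return 1;
	}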
#define OVL_ISUPPER_MASK 1UL
@@ -151,6 +190,7 @@ int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
const struct cred *ovl_override_creds(struct super_block *sb);
+struct super_block *ovl_same_sb(struct super_block *sb);
struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
bool ovl_dentry_remote(struct dentry *dentry);
bool ovl_dentry_weird(struct dentry *dentry);
@@ -164,10 +204,10 @@ struct dentry *ovl_dentry_real(struct dentry *dentry);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
bool ovl_dentry_is_opaque(struct dentry *dentry);
+bool ovl_dentry_is_impure(struct dentry *dentry);
bool ovl_dentry_is_whiteout(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry);
bool ovl_redirect_dir(struct super_block *sb);
-void ovl_clear_redirect_dir(struct super_block *sb);
const char *ovl_dentry_get_redirect(struct dentry *dentry);
void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
@@ -180,6 +220,17 @@ bool ovl_is_whiteout(struct dentry *dentry);
struct file *ovl_path_open(struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry);
void ovl_copy_up_end(struct dentry *dentry);
+bool ovl_check_dir_xattr(struct dentry *dentry, const char *name);
+int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
+ const char *name, const void *value, size_t size,
+ int xerr);
+int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry);
+
+static inline bool ovl_is_impuredir(struct dentry *dentry)
+{
+ return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE);
+}
+
/* namei.c */
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
@@ -197,6 +248,8 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
+int ovl_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags);
int ovl_permission(struct inode *inode, int mask);
int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags);
@@ -222,7 +275,7 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to)
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
-struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
+struct dentry *ovl_lookup_temp(struct dentry *workdir);
struct cattr {
dev_t rdev;
umode_t mode;
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 59614faa14c3..34bc4a9f5c61 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -28,7 +28,10 @@ struct ovl_fs {
/* creds of process who forced instantiation of super block */
const struct cred *creator_cred;
bool tmpfile;
+ bool noxattr;
wait_queue_head_t copyup_wq;
+ /* sb common to all layers */
+ struct super_block *same_sb;
};
/* private information held for every overlayfs dentry */
@@ -40,6 +43,7 @@ struct ovl_entry {
u64 version;
const char *redirect;
bool opaque;
+ bool impure;
bool copying;
};
struct rcu_head rcu;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index c9e70d39c1ea..4882ffb37bae 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -49,11 +49,28 @@ static void ovl_dentry_release(struct dentry *dentry)
}
}
+static int ovl_check_append_only(struct inode *inode, int flag)
+{
+ /*
+ * This test was moot in vfs may_open() because the overlay inode does
+ * not have the S_APPEND flag, so re-check on the real upper inode
+ */
+ if (IS_APPEND(inode)) {
+ if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
+ return -EPERM;
+ if (flag & O_TRUNC)
+ return -EPERM;
+ }
+
+ return 0;
+}
+
static struct dentry *ovl_d_real(struct dentry *dentry,
const struct inode *inode,
unsigned int open_flags)
{
struct dentry *real;
+ int err;
if (!d_is_reg(dentry)) {
if (!inode || inode == d_inode(dentry))
@@ -65,15 +82,20 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
return dentry;
if (open_flags) {
- int err = ovl_open_maybe_copy_up(dentry, open_flags);
-
+ err = ovl_open_maybe_copy_up(dentry, open_flags);
if (err)
return ERR_PTR(err);
}
real = ovl_dentry_upper(dentry);
- if (real && (!inode || inode == d_inode(real)))
+ if (real && (!inode || inode == d_inode(real))) {
+ if (!inode) {
+ err = ovl_check_append_only(d_inode(real), open_flags);
+ if (err)
+ return ERR_PTR(err);
+ }
return real;
+ }
real = ovl_dentry_lower(dentry);
if (!real)
@@ -709,8 +731,8 @@ static const struct xattr_handler *ovl_xattr_handlers[] = {
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
- struct path upperpath = { NULL, NULL };
- struct path workpath = { NULL, NULL };
+ struct path upperpath = { };
+ struct path workpath = { };
struct dentry *root_dentry;
struct inode *realinode;
struct ovl_entry *oe;
@@ -869,6 +891,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
dput(temp);
else
pr_warn("overlayfs: upper fs does not support tmpfile.\n");
+
+ /*
+ * Check if upper/work fs supports trusted.overlay.*
+ * xattr
+ */
+ err = ovl_do_setxattr(ufs->workdir, OVL_XATTR_OPAQUE,
+ "0", 1, 0);
+ if (err) {
+ ufs->noxattr = true;
+ pr_warn("overlayfs: upper fs does not support xattr.\n");
+ } else {
+ vfs_removexattr(ufs->workdir, OVL_XATTR_OPAQUE);
+ }
}
}
@@ -892,11 +927,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
ufs->lower_mnt[ufs->numlower] = mnt;
ufs->numlower++;
+
+ /* Check if all lower layers are on same sb */
+ if (i == 0)
+ ufs->same_sb = mnt->mnt_sb;
+ else if (ufs->same_sb != mnt->mnt_sb)
+ ufs->same_sb = NULL;
}
/* If the upper fs is nonexistent, we mark overlayfs r/o too */
if (!ufs->upper_mnt)
sb->s_flags |= MS_RDONLY;
+ else if (ufs->upper_mnt->mnt_sb != ufs->same_sb)
+ ufs->same_sb = NULL;
if (remote)
sb->s_d_op = &ovl_reval_dentry_operations;
@@ -931,7 +974,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
path_put(&workpath);
kfree(lowertmp);
- oe->__upperdentry = upperpath.dentry;
+ if (upperpath.dentry) {
+ oe->__upperdentry = upperpath.dentry;
+ oe->impure = ovl_is_impuredir(upperpath.dentry);
+ }
for (i = 0; i < numlower; i++) {
oe->lowerstack[i].dentry = stack[i].dentry;
oe->lowerstack[i].mnt = ufs->lower_mnt[i];
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 6e610a205e15..809048913889 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -40,6 +40,13 @@ const struct cred *ovl_override_creds(struct super_block *sb)
return override_creds(ofs->creator_cred);
}
+struct super_block *ovl_same_sb(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return ofs->same_sb;
+}
+
struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
{
size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
@@ -75,11 +82,13 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
type = __OVL_PATH_UPPER;
/*
- * Non-dir dentry can hold lower dentry from previous
- * location.
+ * Non-dir dentry can hold lower dentry of its copy up origin.
*/
- if (oe->numlower && d_is_dir(dentry))
- type |= __OVL_PATH_MERGE;
+ if (oe->numlower) {
+ type |= __OVL_PATH_ORIGIN;
+ if (d_is_dir(dentry))
+ type |= __OVL_PATH_MERGE;
+ }
} else {
if (oe->numlower > 1)
type |= __OVL_PATH_MERGE;
@@ -100,7 +109,7 @@ void ovl_path_lower(struct dentry *dentry, struct path *path)
{
struct ovl_entry *oe = dentry->d_fsdata;
- *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL };
+ *path = oe->numlower ? oe->lowerstack[0] : (struct path) { };
}
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
@@ -166,6 +175,13 @@ bool ovl_dentry_is_opaque(struct dentry *dentry)
return oe->opaque;
}
+bool ovl_dentry_is_impure(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->impure;
+}
+
bool ovl_dentry_is_whiteout(struct dentry *dentry)
{
return !dentry->d_inode && ovl_dentry_is_opaque(dentry);
@@ -182,14 +198,7 @@ bool ovl_redirect_dir(struct super_block *sb)
{
struct ovl_fs *ofs = sb->s_fs_info;
- return ofs->config.redirect_dir;
-}
-
-void ovl_clear_redirect_dir(struct super_block *sb)
-{
- struct ovl_fs *ofs = sb->s_fs_info;
-
- ofs->config.redirect_dir = false;
+ return ofs->config.redirect_dir && !ofs->noxattr;
}
const char *ovl_dentry_get_redirect(struct dentry *dentry)
@@ -294,3 +303,59 @@ void ovl_copy_up_end(struct dentry *dentry)
wake_up_locked(&ofs->copyup_wq);
spin_unlock(&ofs->copyup_wq.lock);
}
+
+bool ovl_check_dir_xattr(struct dentry *dentry, const char *name)
+{
+ int res;
+ char val;
+
+ if (!d_is_dir(dentry))
+ return false;
+
+ res = vfs_getxattr(dentry, name, &val, 1);
+ if (res == 1 && val == 'y')
+ return true;
+
+ return false;
+}
+
+int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
+ const char *name, const void *value, size_t size,
+ int xerr)
+{
+ int err;
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+
+ if (ofs->noxattr)
+ return xerr;
+
+ err = ovl_do_setxattr(upperdentry, name, value, size, 0);
+
+ if (err == -EOPNOTSUPP) {
+ pr_warn("overlayfs: cannot set %s xattr on upper\n", name);
+ ofs->noxattr = true;
+ return xerr;
+ }
+
+ return err;
+}
+
+int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry)
+{
+ int err;
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ if (oe->impure)
+ return 0;
+
+ /*
+ * Do not fail when upper doesn't support xattrs.
+ * Upper inodes won't have origin nor redirect xattr anyway.
+ */
+ err = ovl_check_setxattr(dentry, upperdentry, OVL_XATTR_IMPURE,
+ "y", 1, 0);
+ if (!err)
+ oe->impure = true;
+
+ return err;
+}
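ovl_check_dir_xattr() above treats a directory as marked only when the xattr holds exactly the single byte 'y'. A userspace sketch of the same check, assuming root privileges since trusted.* names are not readable otherwise:

#include <stdbool.h>
#include <stdio.h>
#include <sys/xattr.h>

/* Userspace analogue of ovl_check_dir_xattr(): the directory counts
 * as marked only if the named xattr holds the single byte 'y'. */
static bool dir_xattr_is_y(const char *path, const char *name)
{
	char val;

	return getxattr(path, name, &val, 1) == 1 && val == 'y';
}

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";

	printf("impure: %d\n",
	       dir_xattr_is_y(path, "trusted.overlay.impure"));
	return 0;
}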
diff --git a/fs/pnode.c b/fs/pnode.c
index 5bc7896d122a..53d411a371ce 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p)
return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
}
+static inline struct mount *last_slave(struct mount *p)
+{
+ return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
+}
+
static inline struct mount *next_slave(struct mount *p)
{
return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
@@ -162,6 +167,19 @@ static struct mount *propagation_next(struct mount *m,
}
}
+static struct mount *skip_propagation_subtree(struct mount *m,
+ struct mount *origin)
+{
+ /*
+ * Advance m such that propagation_next will not return
+ * the slaves of m.
+ */
+ if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ m = last_slave(m);
+
+ return m;
+}
+
static struct mount *next_group(struct mount *m, struct mount *origin)
{
while (1) {
@@ -413,65 +431,104 @@ void propagate_mount_unlock(struct mount *mnt)
}
}
-/*
- * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
- */
-static void mark_umount_candidates(struct mount *mnt)
+static void umount_one(struct mount *mnt, struct list_head *to_umount)
{
- struct mount *parent = mnt->mnt_parent;
- struct mount *m;
-
- BUG_ON(parent == mnt);
-
- for (m = propagation_next(parent, parent); m;
- m = propagation_next(m, parent)) {
- struct mount *child = __lookup_mnt(&m->mnt,
- mnt->mnt_mountpoint);
- if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
- continue;
- if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
- SET_MNT_MARK(child);
- }
- }
+ CLEAR_MNT_MARK(mnt);
+ mnt->mnt.mnt_flags |= MNT_UMOUNT;
+ list_del_init(&mnt->mnt_child);
+ list_del_init(&mnt->mnt_umounting);
+ list_move_tail(&mnt->mnt_list, to_umount);
}
/*
* NOTE: unmounting 'mnt' naturally propagates to all other mounts its
* parent propagates to.
*/
-static void __propagate_umount(struct mount *mnt)
+static bool __propagate_umount(struct mount *mnt,
+ struct list_head *to_umount,
+ struct list_head *to_restore)
{
- struct mount *parent = mnt->mnt_parent;
- struct mount *m;
+ bool progress = false;
+ struct mount *child;
- BUG_ON(parent == mnt);
+ /*
+ * The state of the parent won't change if this mount is
+ * already unmounted or marked as without children.
+ */
+ if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
+ goto out;
- for (m = propagation_next(parent, parent); m;
- m = propagation_next(m, parent)) {
- struct mount *topper;
- struct mount *child = __lookup_mnt(&m->mnt,
- mnt->mnt_mountpoint);
- /*
- * umount the child only if the child has no children
- * and the child is marked safe to unmount.
- */
- if (!child || !IS_MNT_MARKED(child))
+ /* Verify topper is the only grandchild that has not been
+ * speculatively unmounted.
+ */
+ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+ if (child->mnt_mountpoint == mnt->mnt.mnt_root)
continue;
- CLEAR_MNT_MARK(child);
+ if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
+ continue;
+ /* Found a mounted child */
+ goto children;
+ }
- /* If there is exactly one mount covering all of child
- * replace child with that mount.
- */
- topper = find_topper(child);
- if (topper)
- mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
- topper);
+ /* Mark mounts that can be unmounted if not locked */
+ SET_MNT_MARK(mnt);
+ progress = true;
+
+ /* If a mount is without children and not locked, umount it. */
+ if (!IS_MNT_LOCKED(mnt)) {
+ umount_one(mnt, to_umount);
+ } else {
+children:
+ list_move_tail(&mnt->mnt_umounting, to_restore);
+ }
+out:
+ return progress;
+}
+
+static void umount_list(struct list_head *to_umount,
+ struct list_head *to_restore)
+{
+ struct mount *mnt, *child, *tmp;
+ list_for_each_entry(mnt, to_umount, mnt_list) {
+ list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
+ /* topper? */
+ if (child->mnt_mountpoint == mnt->mnt.mnt_root)
+ list_move_tail(&child->mnt_umounting, to_restore);
+ else
+ umount_one(child, to_umount);
+ }
+ }
+}
- if (list_empty(&child->mnt_mounts)) {
- list_del_init(&child->mnt_child);
- child->mnt.mnt_flags |= MNT_UMOUNT;
- list_move_tail(&child->mnt_list, &mnt->mnt_list);
+static void restore_mounts(struct list_head *to_restore)
+{
+ /* Restore mounts to a clean working state */
+ while (!list_empty(to_restore)) {
+ struct mount *mnt, *parent;
+ struct mountpoint *mp;
+
+ mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
+ CLEAR_MNT_MARK(mnt);
+ list_del_init(&mnt->mnt_umounting);
+
+ /* Should this mount be reparented? */
+ mp = mnt->mnt_mp;
+ parent = mnt->mnt_parent;
+ while (parent->mnt.mnt_flags & MNT_UMOUNT) {
+ mp = parent->mnt_mp;
+ parent = parent->mnt_parent;
}
+ if (parent != mnt->mnt_parent)
+ mnt_change_mountpoint(parent, mp, mnt);
+ }
+}
+
+static void cleanup_umount_visitations(struct list_head *visited)
+{
+ while (!list_empty(visited)) {
+ struct mount *mnt =
+ list_first_entry(visited, struct mount, mnt_umounting);
+ list_del_init(&mnt->mnt_umounting);
}
}
@@ -485,11 +542,68 @@ static void __propagate_umount(struct mount *mnt)
int propagate_umount(struct list_head *list)
{
struct mount *mnt;
+ LIST_HEAD(to_restore);
+ LIST_HEAD(to_umount);
+ LIST_HEAD(visited);
+
+ /* Find candidates for unmounting */
+ list_for_each_entry_reverse(mnt, list, mnt_list) {
+ struct mount *parent = mnt->mnt_parent;
+ struct mount *m;
+
+ /*
+ * If this mount has already been visited, it is known that its
+ * entire peer group and all of their slaves in the propagation
+ * tree for the mountpoint have already been visited, and there
+ * is no need to visit them again.
+ */
+ if (!list_empty(&mnt->mnt_umounting))
+ continue;
+
+ list_add_tail(&mnt->mnt_umounting, &visited);
+ for (m = propagation_next(parent, parent); m;
+ m = propagation_next(m, parent)) {
+ struct mount *child = __lookup_mnt(&m->mnt,
+ mnt->mnt_mountpoint);
+ if (!child)
+ continue;
+
+ if (!list_empty(&child->mnt_umounting)) {
+ /*
+ * If the child has already been visited, it is
+ * known that its entire peer group and all of
+ * their slaves in the propagation tree for the
+ * mountpoint have already been visited and there
+ * is no need to visit this subtree again.
+ */
+ m = skip_propagation_subtree(m, parent);
+ continue;
+ } else if (child->mnt.mnt_flags & MNT_UMOUNT) {
+ /*
+ * We have come across a partially unmounted
+ * mount in the list that has not been visited yet.
+ * Remember it has been visited and continue
+ * about our merry way.
+ */
+ list_add_tail(&child->mnt_umounting, &visited);
+ continue;
+ }
+
+ /* Check the child and parents while progress is made */
+ while (__propagate_umount(child,
+ &to_umount, &to_restore)) {
+ /* Is the parent a umount candidate? */
+ child = child->mnt_parent;
+ if (list_empty(&child->mnt_umounting))
+ break;
+ }
+ }
+ }
- list_for_each_entry_reverse(mnt, list, mnt_list)
- mark_umount_candidates(mnt);
+ umount_list(&to_umount, &to_restore);
+ restore_mounts(&to_restore);
+ cleanup_umount_visitations(&visited);
+ list_splice_tail(&to_umount, list);
- list_for_each_entry(mnt, list, mnt_list)
- __propagate_umount(mnt);
return 0;
}
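The rewrite leans on a kernel idiom worth calling out: an initialized-empty list node (mnt_umounting) doubles as a "not yet visited" flag, so visiting a mount is just adding it to the visited list and cleanup is a plain list_del_init(). A toy C sketch of that trick, with a hand-rolled list standing in for the kernel's <linux/list.h>:

#include <stdio.h>

/* Minimal doubly linked list: a self-linked (empty) node means
 * "not on any list", which propagate_umount() exploits as a free
 * visited flag with no extra state in struct mount. */
struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

struct mount { const char *name; struct list_head mnt_umounting; };

int main(void)
{
	struct list_head visited;
	struct mount m = { "mnt" };

	INIT_LIST_HEAD(&visited);
	INIT_LIST_HEAD(&m.mnt_umounting);

	if (list_empty(&m.mnt_umounting))	/* not yet visited */
		list_add_tail(&m.mnt_umounting, &visited);

	printf("visited: %d\n", !list_empty(&m.mnt_umounting)); /* 1 */
	return 0;
}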
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9e3ac5c11780..f1e1927ccd48 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -821,10 +821,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- /* Maybe we should limit FOLL_FORCE to actual ptrace users? */
- flags = FOLL_FORCE;
- if (write)
- flags |= FOLL_WRITE;
+ flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
while (count > 0) {
int this_len = min_t(int, count, PAGE_SIZE);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2cc7a8030275..e250910cffc8 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,7 +58,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
struct proc_inode *ei;
struct inode *inode;
- ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->pid = NULL;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 4ee55274f155..45629f4b5402 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -504,7 +504,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
if (&m->list == &kclist_head) {
if (clear_user(buffer, tsz))
return -EFAULT;
- } else if (is_vmalloc_or_module_addr((void *)start)) {
+ } else if (m->type == KCORE_VMALLOC) {
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
if (copy_to_user(buffer, buf, tsz))
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 766f0c637ad1..3803b24ca220 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = {
#endif
#ifdef CONFIG_PID_NS
&pidns_operations,
+ &pidns_for_children_operations,
#endif
#ifdef CONFIG_USER_NS
&userns_operations,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f0c8b33d99b1..520802da059c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -300,11 +300,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
/* We don't show the stack guard page in /proc/maps */
start = vma->vm_start;
- if (stack_guard_page_start(vma, start))
- start += PAGE_SIZE;
end = vma->vm_end;
- if (stack_guard_page_end(vma, end))
- end -= PAGE_SIZE;
seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 792a4e5f9226..4d02c3b65061 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -349,48 +349,48 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
switch (record->type) {
case PSTORE_TYPE_DMESG:
- scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
+ scnprintf(name, sizeof(name), "dmesg-%s-%llu%s",
record->psi->name, record->id,
record->compressed ? ".enc.z" : "");
break;
case PSTORE_TYPE_CONSOLE:
- scnprintf(name, sizeof(name), "console-%s-%lld",
+ scnprintf(name, sizeof(name), "console-%s-%llu",
record->psi->name, record->id);
break;
case PSTORE_TYPE_FTRACE:
- scnprintf(name, sizeof(name), "ftrace-%s-%lld",
+ scnprintf(name, sizeof(name), "ftrace-%s-%llu",
record->psi->name, record->id);
break;
case PSTORE_TYPE_MCE:
- scnprintf(name, sizeof(name), "mce-%s-%lld",
+ scnprintf(name, sizeof(name), "mce-%s-%llu",
record->psi->name, record->id);
break;
case PSTORE_TYPE_PPC_RTAS:
- scnprintf(name, sizeof(name), "rtas-%s-%lld",
+ scnprintf(name, sizeof(name), "rtas-%s-%llu",
record->psi->name, record->id);
break;
case PSTORE_TYPE_PPC_OF:
- scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld",
+ scnprintf(name, sizeof(name), "powerpc-ofw-%s-%llu",
record->psi->name, record->id);
break;
case PSTORE_TYPE_PPC_COMMON:
- scnprintf(name, sizeof(name), "powerpc-common-%s-%lld",
+ scnprintf(name, sizeof(name), "powerpc-common-%s-%llu",
record->psi->name, record->id);
break;
case PSTORE_TYPE_PMSG:
- scnprintf(name, sizeof(name), "pmsg-%s-%lld",
+ scnprintf(name, sizeof(name), "pmsg-%s-%llu",
record->psi->name, record->id);
break;
case PSTORE_TYPE_PPC_OPAL:
- scnprintf(name, sizeof(name), "powerpc-opal-%s-%lld",
+ scnprintf(name, sizeof(name), "powerpc-opal-%s-%llu",
record->psi->name, record->id);
break;
case PSTORE_TYPE_UNKNOWN:
- scnprintf(name, sizeof(name), "unknown-%s-%lld",
+ scnprintf(name, sizeof(name), "unknown-%s-%llu",
record->psi->name, record->id);
break;
default:
- scnprintf(name, sizeof(name), "type%d-%s-%lld",
+ scnprintf(name, sizeof(name), "type%d-%s-%llu",
record->type, record->psi->name, record->id);
break;
}
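The format-string change matters because pstore record IDs are unsigned 64-bit values; with %lld, an ID with the top bit set renders as a negative filename component. A small standalone demonstration:

#include <stdint.h>
#include <stdio.h>

/* Why the pstore names switched from %lld to %llu: the same u64
 * record ID printed both ways. A sketch, not pstore code. */
int main(void)
{
	uint64_t id = 0xffffffffffffff01ULL;
	char name[64];

	snprintf(name, sizeof(name), "dmesg-efi-%lld", (long long)id);
	printf("signed:   %s\n", name);	/* dmesg-efi--255 */

	snprintf(name, sizeof(name), "dmesg-efi-%llu",
		 (unsigned long long)id);
	printf("unsigned: %s\n", name);	/* dmesg-efi-18446744073709551361 */
	return 0;
}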
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index c416e653dc4f..58051265626f 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -30,5 +30,7 @@ extern void pstore_get_backend_records(struct pstore_info *psi,
extern int pstore_mkfile(struct dentry *root,
struct pstore_record *record);
extern bool pstore_is_mounted(void);
+extern void pstore_record_init(struct pstore_record *record,
+ struct pstore_info *psi);
#endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index d468eec9b8a6..1b6e0ff6bff5 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -474,6 +474,20 @@ static size_t copy_kmsg_to_buffer(int hsize, size_t len)
return total_len;
}
+void pstore_record_init(struct pstore_record *record,
+ struct pstore_info *psinfo)
+{
+ memset(record, 0, sizeof(*record));
+
+ record->psi = psinfo;
+
+ /* Report zeroed timestamp if called before timekeeping has resumed. */
+ if (__getnstimeofday(&record->time)) {
+ record->time.tv_sec = 0;
+ record->time.tv_nsec = 0;
+ }
+}
+
/*
* callback from kmsg_dump. (s2,l2) has the most recently
* written bytes, older bytes are in (s1,l1). Save as much
@@ -509,15 +523,14 @@ static void pstore_dump(struct kmsg_dumper *dumper,
int header_size;
int zipped_len = -1;
size_t dump_size;
- struct pstore_record record = {
- .type = PSTORE_TYPE_DMESG,
- .count = oopscount,
- .reason = reason,
- .part = part,
- .compressed = false,
- .buf = psinfo->buf,
- .psi = psinfo,
- };
+ struct pstore_record record;
+
+ pstore_record_init(&record, psinfo);
+ record.type = PSTORE_TYPE_DMESG;
+ record.count = oopscount;
+ record.reason = reason;
+ record.part = part;
+ record.buf = psinfo->buf;
if (big_oops_buf && is_locked) {
dst = big_oops_buf;
@@ -587,12 +600,12 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
const char *e = s + c;
while (s < e) {
- struct pstore_record record = {
- .type = PSTORE_TYPE_CONSOLE,
- .psi = psinfo,
- };
+ struct pstore_record record;
unsigned long flags;
+ pstore_record_init(&record, psinfo);
+ record.type = PSTORE_TYPE_CONSOLE;
+
if (c > psinfo->bufsize)
c = psinfo->bufsize;
@@ -640,19 +653,16 @@ static int pstore_write_user_compat(struct pstore_record *record,
if (record->buf)
return -EINVAL;
- record->buf = kmalloc(record->size, GFP_KERNEL);
- if (!record->buf)
- return -ENOMEM;
-
- if (unlikely(copy_from_user(record->buf, buf, record->size))) {
- ret = -EFAULT;
+ record->buf = memdup_user(buf, record->size);
+ if (unlikely(IS_ERR(record->buf))) {
+ ret = PTR_ERR(record->buf);
goto out;
}
ret = record->psi->write(record);
-out:
kfree(record->buf);
+out:
record->buf = NULL;
return unlikely(ret < 0) ? ret : record->size;
@@ -770,8 +780,11 @@ static void decompress_record(struct pstore_record *record)
int unzipped_len;
char *decompressed;
+ if (!record->compressed)
+ return;
+
/* Only PSTORE_TYPE_DMESG supports compression. */
- if (!record->compressed || record->type != PSTORE_TYPE_DMESG) {
+ if (record->type != PSTORE_TYPE_DMESG) {
pr_warn("ignored compressed record type %d\n", record->type);
return;
}
@@ -819,6 +832,7 @@ void pstore_get_backend_records(struct pstore_info *psi,
struct dentry *root, int quiet)
{
int failed = 0;
+ unsigned int stop_loop = 65536;
if (!psi || !root)
return;
@@ -832,7 +846,7 @@ void pstore_get_backend_records(struct pstore_info *psi,
* may reallocate record.buf. On success, pstore_mkfile() will keep
* the record.buf, so free it only on failure.
*/
- for (;;) {
+ for (; stop_loop; stop_loop--) {
struct pstore_record *record;
int rc;
@@ -841,13 +855,15 @@ void pstore_get_backend_records(struct pstore_info *psi,
pr_err("out of memory creating record\n");
break;
}
- record->psi = psi;
+ pstore_record_init(record, psi);
record->size = psi->read(record);
/* No more records left in backend? */
- if (record->size <= 0)
+ if (record->size <= 0) {
+ kfree(record);
break;
+ }
decompress_record(record);
rc = pstore_mkfile(root, record);
@@ -865,8 +881,11 @@ out:
mutex_unlock(&psi->read_mutex);
if (failed)
- pr_warn("failed to load %d record(s) from '%s'\n",
+ pr_warn("failed to create %d record(s) from '%s'\n",
failed, psi->name);
+ if (!stop_loop)
+ pr_err("looping? Too many records seen from '%s'\n",
+ psi->name);
}
static void pstore_dowork(struct work_struct *work)
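The stop_loop counter bounds the read loop so a backend that keeps returning records cannot wedge the mount forever. A stripped-down sketch of the guard, with a stand-in read_record() in place of psi->read():

#include <stdio.h>

/* Deliberately misbehaving backend stand-in: never reports "no more
 * records", which is exactly the failure mode the bound catches. */
static int read_record(void) { return 1; }

int main(void)
{
	unsigned int stop_loop = 65536;

	for (; stop_loop; stop_loop--) {
		if (read_record() <= 0)
			break;
		/* ... create a file for the record ... */
	}
	if (!stop_loop)
		fprintf(stderr, "looping? too many records\n");
	return 0;
}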
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index 209755e0d7c8..24db02de1787 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -22,16 +22,16 @@ static DEFINE_MUTEX(pmsg_lock);
static ssize_t write_pmsg(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct pstore_record record = {
- .type = PSTORE_TYPE_PMSG,
- .size = count,
- .psi = psinfo,
- };
+ struct pstore_record record;
int ret;
if (!count)
return 0;
+ pstore_record_init(&record, psinfo);
+ record.type = PSTORE_TYPE_PMSG;
+ record.size = count;
+
/* check outside lock, page in any data. write_user also checks */
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 5523df7f17ef..7125b398d312 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -27,7 +27,6 @@
#include <linux/module.h>
#include <linux/version.h>
#include <linux/pstore.h>
-#include <linux/time.h>
#include <linux/io.h>
#include <linux/ioport.h>
#include <linux/platform_device.h>
@@ -58,7 +57,7 @@ module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
MODULE_PARM_DESC(pmsg_size, "size of user space message log");
static unsigned long long mem_address;
-module_param(mem_address, ullong, 0400);
+module_param_hw(mem_address, ullong, other, 0400);
MODULE_PARM_DESC(mem_address,
"start of reserved RAM used to store oops/panic logs");
@@ -356,20 +355,15 @@ out:
}
static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
- bool compressed)
+ struct pstore_record *record)
{
char *hdr;
- struct timespec timestamp;
size_t len;
- /* Report zeroed timestamp if called before timekeeping has resumed. */
- if (__getnstimeofday(&timestamp)) {
- timestamp.tv_sec = 0;
- timestamp.tv_nsec = 0;
- }
hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n",
- (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000),
- compressed ? 'C' : 'D');
+ record->time.tv_sec,
+ record->time.tv_nsec / 1000,
+ record->compressed ? 'C' : 'D');
WARN_ON_ONCE(!hdr);
len = hdr ? strlen(hdr) : 0;
persistent_ram_write(prz, hdr, len);
@@ -440,7 +434,7 @@ static int notrace ramoops_pstore_write(struct pstore_record *record)
prz = cxt->dprzs[cxt->dump_write_cnt];
/* Build header and append record contents. */
- hlen = ramoops_write_kmsg_hdr(prz, record->compressed);
+ hlen = ramoops_write_kmsg_hdr(prz, record);
size = record->size;
if (size + hlen > prz->buffer_size)
size = prz->buffer_size - hlen;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ebf80c7739e1..53a17496c5c5 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1512,6 +1512,22 @@ int dquot_initialize(struct inode *inode)
}
EXPORT_SYMBOL(dquot_initialize);
+bool dquot_initialize_needed(struct inode *inode)
+{
+ struct dquot **dquots;
+ int i;
+
+ if (!dquot_active(inode))
+ return false;
+
+ dquots = i_dquot(inode);
+ for (i = 0; i < MAXQUOTAS; i++)
+ if (!dquots[i] && sb_has_quota_active(inode->i_sb, i))
+ return true;
+ return false;
+}
+EXPORT_SYMBOL(dquot_initialize_needed);
+
/*
* Release all quotas referenced by inode.
*
@@ -1894,6 +1910,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
{
qsize_t space, cur_space;
qsize_t rsv_space = 0;
+ qsize_t inode_usage = 1;
struct dquot *transfer_from[MAXQUOTAS] = {};
int cnt, ret = 0;
char is_valid[MAXQUOTAS] = {};
@@ -1903,6 +1920,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
if (IS_NOQUOTA(inode))
return 0;
+
+ if (inode->i_sb->dq_op->get_inode_usage) {
+ ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage);
+ if (ret)
+ return ret;
+ }
+
/* Initialize the arrays */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
warn_to[cnt].w_type = QUOTA_NL_NOWARN;
@@ -1930,7 +1954,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
continue;
is_valid[cnt] = 1;
transfer_from[cnt] = i_dquot(inode)[cnt];
- ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
+ ret = check_idq(transfer_to[cnt], inode_usage, &warn_to[cnt]);
if (ret)
goto over_quota;
ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
@@ -1947,7 +1971,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
/* Due to IO error we might not have transfer_from[] structure */
if (transfer_from[cnt]) {
int wtype;
- wtype = info_idq_free(transfer_from[cnt], 1);
+ wtype = info_idq_free(transfer_from[cnt], inode_usage);
if (wtype != QUOTA_NL_NOWARN)
prepare_warning(&warn_from_inodes[cnt],
transfer_from[cnt], wtype);
@@ -1955,13 +1979,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
if (wtype != QUOTA_NL_NOWARN)
prepare_warning(&warn_from_space[cnt],
transfer_from[cnt], wtype);
- dquot_decr_inodes(transfer_from[cnt], 1);
+ dquot_decr_inodes(transfer_from[cnt], inode_usage);
dquot_decr_space(transfer_from[cnt], cur_space);
dquot_free_reserved_space(transfer_from[cnt],
rsv_space);
}
- dquot_incr_inodes(transfer_to[cnt], 1);
+ dquot_incr_inodes(transfer_to[cnt], inode_usage);
dquot_incr_space(transfer_to[cnt], cur_space);
dquot_resv_space(transfer_to[cnt], rsv_space);
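The core of the change is that one inode may now account for more than one quota unit (e.g. ext4 reporting extra blocks consumed by large xattrs via ->get_inode_usage), so the transfer path charges inode_usage units instead of a hard-coded 1. Toy arithmetic only, not the kernel structures:

#include <stdio.h>

/* Sketch of the __dquot_transfer() accounting: decrement the old
 * owner and increment the new owner by the same inode_usage. */
struct dquot { long long curinodes; };

static void transfer(struct dquot *from, struct dquot *to,
		     long long inode_usage)
{
	from->curinodes -= inode_usage;
	to->curinodes += inode_usage;
}

int main(void)
{
	struct dquot old_owner = { .curinodes = 10 };
	struct dquot new_owner = { .curinodes = 3 };

	transfer(&old_owner, &new_owner, 2 /* inode + xattr blocks */);
	printf("old=%lld new=%lld\n", old_owner.curinodes,
	       new_owner.curinodes);	/* old=8 new=5 */
	return 0;
}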
diff --git a/fs/read_write.c b/fs/read_write.c
index 47c1d4484df9..0cc7033aa413 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -356,46 +356,6 @@ out_putf:
}
#endif
-ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
-{
- struct kiocb kiocb;
- ssize_t ret;
-
- if (!file->f_op->read_iter)
- return -EINVAL;
-
- init_sync_kiocb(&kiocb, file);
- kiocb.ki_pos = *ppos;
-
- iter->type |= READ;
- ret = call_read_iter(file, &kiocb, iter);
- BUG_ON(ret == -EIOCBQUEUED);
- if (ret > 0)
- *ppos = kiocb.ki_pos;
- return ret;
-}
-EXPORT_SYMBOL(vfs_iter_read);
-
-ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
-{
- struct kiocb kiocb;
- ssize_t ret;
-
- if (!file->f_op->write_iter)
- return -EINVAL;
-
- init_sync_kiocb(&kiocb, file);
- kiocb.ki_pos = *ppos;
-
- iter->type |= WRITE;
- ret = call_write_iter(file, &kiocb, iter);
- BUG_ON(ret == -EIOCBQUEUED);
- if (ret > 0)
- *ppos = kiocb.ki_pos;
- return ret;
-}
-EXPORT_SYMBOL(vfs_iter_write);
-
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
struct inode *inode;
@@ -678,16 +638,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
struct kiocb kiocb;
ssize_t ret;
- if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
- return -EOPNOTSUPP;
-
init_sync_kiocb(&kiocb, filp);
- if (flags & RWF_HIPRI)
- kiocb.ki_flags |= IOCB_HIPRI;
- if (flags & RWF_DSYNC)
- kiocb.ki_flags |= IOCB_DSYNC;
- if (flags & RWF_SYNC)
- kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
+ ret = kiocb_set_rw_flags(&kiocb, flags);
+ if (ret)
+ return ret;
kiocb.ki_pos = *ppos;
if (type == READ)
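kiocb_set_rw_flags() centralizes the mapping of the per-call RWF_* flags that userspace hands to preadv2()/pwritev2(). A minimal userspace exercise of that interface, assuming a glibc recent enough to expose pwritev2 and RWF_DSYNC:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("rwf-demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	struct iovec iov = { .iov_base = "hello\n", .iov_len = 6 };
	ssize_t n;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Data-sync this one write without opening the file O_DSYNC. */
	n = pwritev2(fd, &iov, 1, 0, RWF_DSYNC);
	if (n < 0)
		perror("pwritev2");
	else
		printf("wrote %zd bytes\n", n);
	close(fd);
	return 0;
}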
@@ -916,86 +870,114 @@ out:
}
#endif
-static ssize_t __do_readv_writev(int type, struct file *file,
- struct iov_iter *iter, loff_t *pos, int flags)
+static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
+ loff_t *pos, int flags)
{
size_t tot_len;
ssize_t ret = 0;
+ if (!(file->f_mode & FMODE_READ))
+ return -EBADF;
+ if (!(file->f_mode & FMODE_CAN_READ))
+ return -EINVAL;
+
tot_len = iov_iter_count(iter);
if (!tot_len)
goto out;
- ret = rw_verify_area(type, file, pos, tot_len);
+ ret = rw_verify_area(READ, file, pos, tot_len);
if (ret < 0)
- goto out;
-
- if (type != READ)
- file_start_write(file);
+ return ret;
- if ((type == READ && file->f_op->read_iter) ||
- (type == WRITE && file->f_op->write_iter))
- ret = do_iter_readv_writev(file, iter, pos, type, flags);
+ if (file->f_op->read_iter)
+ ret = do_iter_readv_writev(file, iter, pos, READ, flags);
else
- ret = do_loop_readv_writev(file, iter, pos, type, flags);
-
- if (type != READ)
- file_end_write(file);
-
+ ret = do_loop_readv_writev(file, iter, pos, READ, flags);
out:
- if ((ret + (type == READ)) > 0) {
- if (type == READ)
- fsnotify_access(file);
- else
- fsnotify_modify(file);
- }
+ if (ret >= 0)
+ fsnotify_access(file);
return ret;
}
-static ssize_t do_readv_writev(int type, struct file *file,
- const struct iovec __user *uvector,
- unsigned long nr_segs, loff_t *pos,
- int flags)
+ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
+ int flags)
{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t ret;
+ if (!file->f_op->read_iter)
+ return -EINVAL;
+ return do_iter_read(file, iter, ppos, flags);
+}
+EXPORT_SYMBOL(vfs_iter_read);
+
+static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
+ loff_t *pos, int flags)
+{
+ size_t tot_len;
+ ssize_t ret = 0;
- ret = import_iovec(type, uvector, nr_segs,
- ARRAY_SIZE(iovstack), &iov, &iter);
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EBADF;
+ if (!(file->f_mode & FMODE_CAN_WRITE))
+ return -EINVAL;
+
+ tot_len = iov_iter_count(iter);
+ if (!tot_len)
+ return 0;
+ ret = rw_verify_area(WRITE, file, pos, tot_len);
if (ret < 0)
return ret;
- ret = __do_readv_writev(type, file, &iter, pos, flags);
- kfree(iov);
-
+ if (file->f_op->write_iter)
+ ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
+ else
+ ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
+ if (ret > 0)
+ fsnotify_modify(file);
return ret;
}
+ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
+ int flags)
+{
+ if (!file->f_op->write_iter)
+ return -EINVAL;
+ return do_iter_write(file, iter, ppos, flags);
+}
+EXPORT_SYMBOL(vfs_iter_write);
+
ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
unsigned long vlen, loff_t *pos, int flags)
{
- if (!(file->f_mode & FMODE_READ))
- return -EBADF;
- if (!(file->f_mode & FMODE_CAN_READ))
- return -EINVAL;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ ssize_t ret;
- return do_readv_writev(READ, file, vec, vlen, pos, flags);
-}
+ ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret >= 0) {
+ ret = do_iter_read(file, &iter, pos, flags);
+ kfree(iov);
+ }
+ return ret;
+}
EXPORT_SYMBOL(vfs_readv);
ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
unsigned long vlen, loff_t *pos, int flags)
{
- if (!(file->f_mode & FMODE_WRITE))
- return -EBADF;
- if (!(file->f_mode & FMODE_CAN_WRITE))
- return -EINVAL;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ ssize_t ret;
- return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
+ ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret >= 0) {
+ file_start_write(file);
+ ret = do_iter_write(file, &iter, pos, flags);
+ file_end_write(file);
+ kfree(iov);
+ }
+ return ret;
}
-
EXPORT_SYMBOL(vfs_writev);
static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
@@ -1143,44 +1125,20 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
}
#ifdef CONFIG_COMPAT
-
-static ssize_t compat_do_readv_writev(int type, struct file *file,
- const struct compat_iovec __user *uvector,
- unsigned long nr_segs, loff_t *pos,
- int flags)
+static size_t compat_readv(struct file *file,
+ const struct compat_iovec __user *vec,
+ unsigned long vlen, loff_t *pos, int flags)
{
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
ssize_t ret;
- ret = compat_import_iovec(type, uvector, nr_segs,
- UIO_FASTIOV, &iov, &iter);
- if (ret < 0)
- return ret;
-
- ret = __do_readv_writev(type, file, &iter, pos, flags);
- kfree(iov);
-
- return ret;
-}
-
-static size_t compat_readv(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos, int flags)
-{
- ssize_t ret = -EBADF;
-
- if (!(file->f_mode & FMODE_READ))
- goto out;
-
- ret = -EINVAL;
- if (!(file->f_mode & FMODE_CAN_READ))
- goto out;
-
- ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
-
-out:
+ ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
+ if (ret >= 0) {
+ ret = do_iter_read(file, &iter, pos, flags);
+ kfree(iov);
+ }
if (ret > 0)
add_rchar(current, ret);
inc_syscr(current);
@@ -1276,18 +1234,18 @@ static size_t compat_writev(struct file *file,
const struct compat_iovec __user *vec,
unsigned long vlen, loff_t *pos, int flags)
{
- ssize_t ret = -EBADF;
-
- if (!(file->f_mode & FMODE_WRITE))
- goto out;
-
- ret = -EINVAL;
- if (!(file->f_mode & FMODE_CAN_WRITE))
- goto out;
-
- ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, 0);
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ ssize_t ret;
-out:
+ ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
+ if (ret >= 0) {
+ file_start_write(file);
+ ret = do_iter_write(file, &iter, pos, flags);
+ file_end_write(file);
+ kfree(iov);
+ }
if (ret > 0)
add_wchar(current, ret);
inc_syscw(current);
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index aca73dd73906..e3c558d1b78c 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -724,18 +724,18 @@ static void errcatch_print_vi(struct virtual_item *vi)
}
static struct item_operations errcatch_ops = {
- errcatch_bytes_number,
- errcatch_decrement_key,
- errcatch_is_left_mergeable,
- errcatch_print_item,
- errcatch_check_item,
-
- errcatch_create_vi,
- errcatch_check_left,
- errcatch_check_right,
- errcatch_part_size,
- errcatch_unit_num,
- errcatch_print_vi
+ .bytes_number = errcatch_bytes_number,
+ .decrement_key = errcatch_decrement_key,
+ .is_left_mergeable = errcatch_is_left_mergeable,
+ .print_item = errcatch_print_item,
+ .check_item = errcatch_check_item,
+
+ .create_vi = errcatch_create_vi,
+ .check_left = errcatch_check_left,
+ .check_right = errcatch_check_right,
+ .part_size = errcatch_part_size,
+ .unit_num = errcatch_unit_num,
+ .print_vi = errcatch_print_vi
};
#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
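The errcatch_ops conversion illustrates why designated initializers are preferred for ops tables: positional initialization silently misbinds callbacks if fields are ever reordered or inserted, while .name = value ties each callback to its slot. A standalone sketch with an invented two-member ops struct:

#include <stdio.h>

struct item_ops {
	int  (*bytes_number)(int);
	void (*print_item)(void);
};

static int my_bytes(int n) { return n * 2; }
static void my_print(void) { puts("item"); }

/* Reordering the struct members above would not break this table. */
static const struct item_ops ops = {
	.bytes_number = my_bytes,
	.print_item   = my_print,
};

int main(void)
{
	printf("%d\n", ops.bytes_number(21));	/* 42 */
	ops.print_item();
	return 0;
}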
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index da01f497180a..a11d773e5ff3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1112,7 +1112,7 @@ static int flush_commit_list(struct super_block *s,
depth = reiserfs_write_unlock_nested(s);
if (reiserfs_barrier_flush(s))
__sync_dirty_buffer(jl->j_commit_bh,
- REQ_PREFLUSH | REQ_FUA);
+ REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
else
sync_dirty_buffer(jl->j_commit_bh);
reiserfs_write_lock_nested(s, depth);
@@ -1271,7 +1271,7 @@ static int _update_journal_header_block(struct super_block *sb,
if (reiserfs_barrier_flush(sb))
__sync_dirty_buffer(journal->j_header_bh,
- REQ_PREFLUSH | REQ_FUA);
+ REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
else
sync_dirty_buffer(journal->j_header_bh);
@@ -2956,7 +2956,7 @@ void reiserfs_wait_on_write_block(struct super_block *s)
static void queue_log_writer(struct super_block *s)
{
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct reiserfs_journal *journal = SB_JOURNAL(s);
set_bit(J_WRITERS_QUEUED, &journal->j_state);
diff --git a/fs/select.c b/fs/select.c
index bd4b2ccfd346..9d5f15ed87fe 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -180,7 +180,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
return table->entry++;
}
-static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
struct poll_wqueues *pwq = wait->private;
DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
@@ -206,7 +206,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
return default_wake_function(&dummy_wait, mode, sync, key);
}
-static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
struct poll_table_entry *entry;
@@ -633,10 +633,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
goto out_nofds;
alloc_size = 6 * size;
- bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
- if (!bits && alloc_size > PAGE_SIZE)
- bits = vmalloc(alloc_size);
-
+ bits = kvmalloc(alloc_size, GFP_KERNEL);
if (!bits)
goto out_nofds;
}
@@ -1164,59 +1161,25 @@ static
int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
unsigned long *fdset)
{
- nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
if (ufdset) {
- unsigned long odd;
-
- if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t)))
- return -EFAULT;
-
- odd = nr & 1UL;
- nr &= ~1UL;
- while (nr) {
- unsigned long h, l;
- if (__get_user(l, ufdset) || __get_user(h, ufdset+1))
- return -EFAULT;
- ufdset += 2;
- *fdset++ = h << 32 | l;
- nr -= 2;
- }
- if (odd && __get_user(*fdset, ufdset))
- return -EFAULT;
+ return compat_get_bitmap(fdset, ufdset, nr);
} else {
/* Tricky, must clear full unsigned long in the
- * kernel fdset at the end, this makes sure that
+ * kernel fdset at the end, ALIGN makes sure that
* actually happens.
*/
- memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t));
+ memset(fdset, 0, ALIGN(nr, BITS_PER_LONG));
+ return 0;
}
- return 0;
}
static
int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
unsigned long *fdset)
{
- unsigned long odd;
- nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
-
if (!ufdset)
return 0;
-
- odd = nr & 1UL;
- nr &= ~1UL;
- while (nr) {
- unsigned long h, l;
- l = *fdset++;
- h = l >> 32;
- if (__put_user(l, ufdset) || __put_user(h, ufdset+1))
- return -EFAULT;
- ufdset += 2;
- nr -= 2;
- }
- if (odd && __put_user(*fdset, ufdset))
- return -EFAULT;
- return 0;
+ return compat_put_bitmap(ufdset, fdset, nr);
}
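compat_get_bitmap() replaces the removed open-coded loop that merged pairs of 32-bit compat words into 64-bit kernel words, low word first. The packing itself, as a standalone sketch:

#include <stdint.h>
#include <stdio.h>

/* Two 32-bit compat fd_set words combine into one 64-bit kernel
 * word: h << 32 | l, exactly as the deleted loop did. */
int main(void)
{
	uint32_t compat_fdset[2] = { 0x000000ff, 0x00000001 };
	uint64_t l = compat_fdset[0], h = compat_fdset[1];
	uint64_t fdset = h << 32 | l;

	printf("fdset = %#018llx\n", (unsigned long long)fdset);
	/* fdset = 0x00000001000000ff */
	return 0;
}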
diff --git a/fs/seq_file.c b/fs/seq_file.c
index ca69fb99e41a..dc7c2be963ed 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -25,21 +25,7 @@ static void seq_set_overflow(struct seq_file *m)
static void *seq_buf_alloc(unsigned long size)
{
- void *buf;
- gfp_t gfp = GFP_KERNEL;
-
- /*
- * For high order allocations, use __GFP_NORETRY to avoid oom-killing -
- * it's better to fall back to vmalloc() than to kill things. For small
- * allocations, just use GFP_KERNEL which will oom kill, thus no need
- * for vmalloc fallback.
- */
- if (size > PAGE_SIZE)
- gfp |= __GFP_NORETRY | __GFP_NOWARN;
- buf = kmalloc(size, gfp);
- if (!buf && size > PAGE_SIZE)
- buf = vmalloc(size);
- return buf;
+ return kvmalloc(size, GFP_KERNEL);
}
/**
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 270221fcef42..593b022ac11b 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -38,12 +38,12 @@ void signalfd_cleanup(struct sighand_struct *sighand)
/*
* The lockless check can race with remove_wait_queue() in progress,
* but in this case its caller should run under rcu_read_lock() and
- * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return.
+ * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return.
*/
if (likely(!waitqueue_active(wqh)))
return;
- /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */
+ /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */
wake_up_poll(wqh, POLLHUP | POLLFREE);
}
diff --git a/fs/splice.c b/fs/splice.c
index 540c4a44756c..ae41201d0325 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -762,7 +762,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
sd.total_len - left);
- ret = vfs_iter_write(out, &from, &sd.pos);
+ ret = vfs_iter_write(out, &from, &sd.pos, 0);
if (ret <= 0)
break;
diff --git a/fs/stat.c b/fs/stat.c
index f494b182c7c7..c35610845ab1 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -672,6 +672,7 @@ void __inode_add_bytes(struct inode *inode, loff_t bytes)
inode->i_bytes -= 512;
}
}
+EXPORT_SYMBOL(__inode_add_bytes);
void inode_add_bytes(struct inode *inode, loff_t bytes)
{
diff --git a/fs/statfs.c b/fs/statfs.c
index 4e4623c7a126..fab9b6a3c116 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -38,6 +38,8 @@ static int flags_by_sb(int s_flags)
flags |= ST_SYNCHRONOUS;
if (s_flags & MS_MANDLOCK)
flags |= ST_MANDLOCK;
+ if (s_flags & MS_RDONLY)
+ flags |= ST_RDONLY;
return flags;
}
@@ -244,6 +246,7 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
#ifdef CONFIG_COMPAT
static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf)
{
+ struct compat_statfs buf;
if (sizeof ubuf->f_blocks == 4) {
if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
@@ -257,20 +260,20 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
&& (kbuf->f_ffree & 0xffffffff00000000ULL))
return -EOVERFLOW;
}
- if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
- __put_user(kbuf->f_type, &ubuf->f_type) ||
- __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
- __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
- __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
- __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
- __put_user(kbuf->f_files, &ubuf->f_files) ||
- __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
- __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
- __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
- __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
- __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
- __put_user(kbuf->f_flags, &ubuf->f_flags) ||
- __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
+ memset(&buf, 0, sizeof(struct compat_statfs));
+ buf.f_type = kbuf->f_type;
+ buf.f_bsize = kbuf->f_bsize;
+ buf.f_blocks = kbuf->f_blocks;
+ buf.f_bfree = kbuf->f_bfree;
+ buf.f_bavail = kbuf->f_bavail;
+ buf.f_files = kbuf->f_files;
+ buf.f_ffree = kbuf->f_ffree;
+ buf.f_namelen = kbuf->f_namelen;
+ buf.f_fsid.val[0] = kbuf->f_fsid.val[0];
+ buf.f_fsid.val[1] = kbuf->f_fsid.val[1];
+ buf.f_frsize = kbuf->f_frsize;
+ buf.f_flags = kbuf->f_flags;
+ if (copy_to_user(ubuf, &buf, sizeof(struct compat_statfs)))
return -EFAULT;
return 0;
}
@@ -299,6 +302,7 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *,
static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
{
+ struct compat_statfs64 buf;
if (sizeof(ubuf->f_bsize) == 4) {
if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
@@ -312,20 +316,20 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
&& (kbuf->f_ffree & 0xffffffff00000000ULL))
return -EOVERFLOW;
}
- if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
- __put_user(kbuf->f_type, &ubuf->f_type) ||
- __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
- __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
- __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
- __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
- __put_user(kbuf->f_files, &ubuf->f_files) ||
- __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
- __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
- __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
- __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
- __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
- __put_user(kbuf->f_flags, &ubuf->f_flags) ||
- __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
+ memset(&buf, 0, sizeof(struct compat_statfs64));
+ buf.f_type = kbuf->f_type;
+ buf.f_bsize = kbuf->f_bsize;
+ buf.f_blocks = kbuf->f_blocks;
+ buf.f_bfree = kbuf->f_bfree;
+ buf.f_bavail = kbuf->f_bavail;
+ buf.f_files = kbuf->f_files;
+ buf.f_ffree = kbuf->f_ffree;
+ buf.f_namelen = kbuf->f_namelen;
+ buf.f_fsid.val[0] = kbuf->f_fsid.val[0];
+ buf.f_fsid.val[1] = kbuf->f_fsid.val[1];
+ buf.f_frsize = kbuf->f_frsize;
+ buf.f_flags = kbuf->f_flags;
+ if (copy_to_user(ubuf, &buf, sizeof(struct compat_statfs64)))
return -EFAULT;
return 0;
}
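The fields now staged in a zeroed stack struct and pushed with a single copy_to_user() are the same ones a plain statfs() call returns; a quick userspace look at them:

#include <stdio.h>
#include <sys/statfs.h>

int main(int argc, char **argv)
{
	struct statfs buf;

	if (statfs(argc > 1 ? argv[1] : "/", &buf) < 0) {
		perror("statfs");
		return 1;
	}
	printf("type=%#lx bsize=%ld blocks=%llu bfree=%llu flags=%#lx\n",
	       (unsigned long)buf.f_type, (long)buf.f_bsize,
	       (unsigned long long)buf.f_blocks,
	       (unsigned long long)buf.f_bfree,
	       (unsigned long)buf.f_flags);
	return 0;
}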
diff --git a/fs/sync.c b/fs/sync.c
index 11ba023434b1..2a54c1f22035 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -192,7 +192,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
spin_unlock(&inode->i_lock);
mark_inode_dirty_sync(inode);
}
- return call_fsync(file, start, end, datasync);
+ return file->f_op->fsync(file, start, end, datasync);
}
EXPORT_SYMBOL(vfs_fsync_range);
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 5bdae85ceef7..f5191cb2c947 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -45,7 +45,7 @@ static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
mark_inode_dirty(dir);
}
if (IS_DIRSYNC(dir))
- err = write_one_page(page, 1);
+ err = write_one_page(page);
else
unlock_page(page);
return err;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index c543cdb5f8ed..ece0c02d7e63 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -169,7 +169,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
}
static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
- const struct itimerspec *ktmr)
+ const struct itimerspec64 *ktmr)
{
enum hrtimer_mode htmode;
ktime_t texp;
@@ -178,10 +178,10 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
htmode = (flags & TFD_TIMER_ABSTIME) ?
HRTIMER_MODE_ABS: HRTIMER_MODE_REL;
- texp = timespec_to_ktime(ktmr->it_value);
+ texp = timespec64_to_ktime(ktmr->it_value);
ctx->expired = 0;
ctx->ticks = 0;
- ctx->tintv = timespec_to_ktime(ktmr->it_interval);
+ ctx->tintv = timespec64_to_ktime(ktmr->it_interval);
if (isalarm(ctx)) {
alarm_init(&ctx->t.alarm,
@@ -432,16 +432,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
}
static int do_timerfd_settime(int ufd, int flags,
- const struct itimerspec *new,
- struct itimerspec *old)
+ const struct itimerspec64 *new,
+ struct itimerspec64 *old)
{
struct fd f;
struct timerfd_ctx *ctx;
int ret;
if ((flags & ~TFD_SETTIME_FLAGS) ||
- !timespec_valid(&new->it_value) ||
- !timespec_valid(&new->it_interval))
+ !itimerspec64_valid(new))
return -EINVAL;
ret = timerfd_fget(ufd, &f);
@@ -487,8 +486,8 @@ static int do_timerfd_settime(int ufd, int flags,
hrtimer_forward_now(&ctx->t.tmr, ctx->tintv);
}
- old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
- old->it_interval = ktime_to_timespec(ctx->tintv);
+ old->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx));
+ old->it_interval = ktime_to_timespec64(ctx->tintv);
/*
* Re-program the timer to the new value ...
@@ -500,7 +499,7 @@ static int do_timerfd_settime(int ufd, int flags,
return ret;
}
-static int do_timerfd_gettime(int ufd, struct itimerspec *t)
+static int do_timerfd_gettime(int ufd, struct itimerspec64 *t)
{
struct fd f;
struct timerfd_ctx *ctx;
@@ -525,8 +524,8 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t)
hrtimer_restart(&ctx->t.tmr);
}
}
- t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
- t->it_interval = ktime_to_timespec(ctx->tintv);
+ t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx));
+ t->it_interval = ktime_to_timespec64(ctx->tintv);
spin_unlock_irq(&ctx->wqh.lock);
fdput(f);
return 0;
@@ -536,15 +535,15 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
const struct itimerspec __user *, utmr,
struct itimerspec __user *, otmr)
{
- struct itimerspec new, old;
+ struct itimerspec64 new, old;
int ret;
- if (copy_from_user(&new, utmr, sizeof(new)))
+ if (get_itimerspec64(&new, utmr))
return -EFAULT;
ret = do_timerfd_settime(ufd, flags, &new, &old);
if (ret)
return ret;
- if (otmr && copy_to_user(otmr, &old, sizeof(old)))
+ if (otmr && put_itimerspec64(&old, otmr))
return -EFAULT;
return ret;
@@ -552,11 +551,11 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
{
- struct itimerspec kotmr;
+ struct itimerspec64 kotmr;
int ret = do_timerfd_gettime(ufd, &kotmr);
if (ret)
return ret;
- return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0;
+ return put_itimerspec64(&kotmr, otmr) ? -EFAULT : 0;
}
#ifdef CONFIG_COMPAT
@@ -564,15 +563,15 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
const struct compat_itimerspec __user *, utmr,
struct compat_itimerspec __user *, otmr)
{
- struct itimerspec new, old;
+ struct itimerspec64 new, old;
int ret;
- if (get_compat_itimerspec(&new, utmr))
+ if (get_compat_itimerspec64(&new, utmr))
return -EFAULT;
ret = do_timerfd_settime(ufd, flags, &new, &old);
if (ret)
return ret;
- if (otmr && put_compat_itimerspec(otmr, &old))
+ if (otmr && put_compat_itimerspec64(&old, otmr))
return -EFAULT;
return ret;
}
@@ -580,10 +579,10 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd,
struct compat_itimerspec __user *, otmr)
{
- struct itimerspec kotmr;
+ struct itimerspec64 kotmr;
int ret = do_timerfd_gettime(ufd, &kotmr);
if (ret)
return ret;
- return put_compat_itimerspec(otmr, &kotmr) ? -EFAULT: 0;
+ return put_compat_itimerspec64(&kotmr, otmr) ? -EFAULT : 0;
}
#endif
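The userspace ABI is unchanged by the itimerspec64 conversion; callers still pass struct itimerspec, and get_itimerspec64()/put_itimerspec64() translate at the boundary. A minimal timerfd round trip from userspace:

#include <stdint.h>
#include <stdio.h>
#include <sys/timerfd.h>
#include <unistd.h>

int main(void)
{
	int fd = timerfd_create(CLOCK_MONOTONIC, 0);
	struct itimerspec its = {
		.it_value    = { .tv_sec = 1 },	/* first expiry in 1s */
		.it_interval = { 0 },		/* one-shot */
	};
	uint64_t expirations;

	if (fd < 0 || timerfd_settime(fd, 0, &its, NULL) < 0) {
		perror("timerfd");
		return 1;
	}
	read(fd, &expirations, sizeof(expirations));	/* blocks ~1s */
	printf("expired %llu time(s)\n", (unsigned long long)expirations);
	close(fd);
	return 0;
}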
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 21d36d284735..328e89c2cf83 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -266,7 +266,7 @@ static const struct super_operations tracefs_super_operations = {
static int trace_fill_super(struct super_block *sb, void *data, int silent)
{
- static struct tree_descr trace_files[] = {{""}};
+ static const struct tree_descr trace_files[] = {{""}};
struct tracefs_fs_info *fsi;
int err;
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index b0d0623c83ed..83a961bf7280 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -61,3 +61,16 @@ config UBIFS_FS_ENCRYPTION
feature is similar to ecryptfs, but it is more memory
efficient since it avoids caching the encrypted and
decrypted pages in the page cache.
+
+config UBIFS_FS_SECURITY
+ bool "UBIFS Security Labels"
+ depends on UBIFS_FS
+ default y
+ help
+ Security labels provide an access control facility to support Linux
+ Security Modules (LSMs) such as AppArmor, SELinux, Smack and TOMOYO
+ Linux. This option enables an extended attribute handler for file
+ security labels in the ubifs filesystem; it requires extended
+ attribute support to be enabled.
+
+ If you are not using a security module, say N.
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 718b749fa11a..7cd8a7b95299 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2391,8 +2391,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
ubifs_dump_node(c, sa->node);
return -EINVAL;
}
- if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
- sa->type != UBIFS_XENT_NODE) {
+ if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE &&
+ sb->type != UBIFS_XENT_NODE) {
ubifs_err(c, "bad node type %d", sb->type);
ubifs_dump_node(c, sb->node);
return -EINVAL;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 8049851cac42..566079d9b402 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -121,7 +121,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
inode_init_owner(inode, dir, mode);
inode->i_mtime = inode->i_atime = inode->i_ctime =
- ubifs_current_time(inode);
+ current_time(inode);
inode->i_mapping->nrpages = 0;
switch (mode & S_IFMT) {
@@ -766,7 +766,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
inc_nlink(inode);
ihold(inode);
- inode->i_ctime = ubifs_current_time(inode);
+ inode->i_ctime = current_time(inode);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
@@ -841,7 +841,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
}
lock_2_inodes(dir, inode);
- inode->i_ctime = ubifs_current_time(dir);
+ inode->i_ctime = current_time(dir);
drop_nlink(inode);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
@@ -945,7 +945,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
}
lock_2_inodes(dir, inode);
- inode->i_ctime = ubifs_current_time(dir);
+ inode->i_ctime = current_time(dir);
clear_nlink(inode);
drop_nlink(dir);
dir->i_size -= sz_change;
@@ -1422,7 +1422,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the @i_ctime for inodes on a
* rename.
*/
- time = ubifs_current_time(old_dir);
+ time = current_time(old_dir);
old_inode->i_ctime = time;
/* We must adjust parent link count when renaming directories */
@@ -1595,7 +1595,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
lock_4_inodes(old_dir, new_dir, NULL, NULL);
- time = ubifs_current_time(old_dir);
+ time = current_time(old_dir);
fst_inode->i_ctime = time;
snd_inode->i_ctime = time;
old_dir->i_mtime = old_dir->i_ctime = time;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index d9ae86f96df7..2cda3d67e2d0 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1196,7 +1196,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
ui->ui_size = inode->i_size;
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
/* Other attributes may be changed at the same time as well */
do_attr_changes(inode, attr);
err = ubifs_jnl_truncate(c, inode, old_size, new_size);
@@ -1243,7 +1243,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
if (attr->ia_valid & ATTR_SIZE) {
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
/* 'truncate_setsize()' changed @i_size, update @ui_size */
ui->ui_size = inode->i_size;
}
@@ -1420,7 +1420,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time,
*/
static int update_mctime(struct inode *inode)
{
- struct timespec now = ubifs_current_time(inode);
+ struct timespec now = current_time(inode);
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1434,7 +1434,7 @@ static int update_mctime(struct inode *inode)
return err;
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
@@ -1511,7 +1511,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf)
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct ubifs_info *c = inode->i_sb->s_fs_info;
- struct timespec now = ubifs_current_time(inode);
+ struct timespec now = current_time(inode);
struct ubifs_budget_req req = { .new_page = 1 };
int err, update_time;
@@ -1579,7 +1579,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf)
struct ubifs_inode *ui = ubifs_inode(inode);
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index da519ba205f6..fdc311246807 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -53,7 +53,7 @@ void ubifs_set_inode_flags(struct inode *inode)
* ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags.
* @ioctl_flags: flags to convert
*
- * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags
+ * This function converts ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags
* (@UBIFS_COMPR_FL, etc).
*/
static int ioctl2ubifs(int ioctl_flags)
@@ -78,8 +78,8 @@ static int ioctl2ubifs(int ioctl_flags)
* ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags.
* @ubifs_flags: flags to convert
*
- * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags
- * (@FS_COMPR_FL, etc).
+ * This function converts UBIFS inode flags (@UBIFS_COMPR_FL, etc) to ioctl
+ * flags (@FS_COMPR_FL, etc).
*/
static int ubifs2ioctl(int ubifs_flags)
{
@@ -126,7 +126,7 @@ static int setflags(struct inode *inode, int flags)
ui->flags = ioctl2ubifs(flags);
ubifs_set_inode_flags(inode);
- inode->i_ctime = ubifs_current_time(inode);
+ inode->i_ctime = current_time(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 8ece6ca58c0b..caf83d68fb38 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -225,16 +225,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c,
}
/**
- * ubifs_current_time - round current time to time granularity.
- * @inode: inode
- */
-static inline struct timespec ubifs_current_time(struct inode *inode)
-{
- return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
- current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
-}
-
-/**
* ubifs_tnc_lookup - look up a file-system node.
* @c: UBIFS file-system description object
* @key: node key to lookup
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 586d59347fff..3af4472061cc 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -442,7 +442,6 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
{
int empty_offs, pad_len;
- lnum = lnum;
dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);
ubifs_assert(!(*offs & 7));
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 7f1ead29e727..8c25081a5109 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -84,6 +84,8 @@ static int create_default_filesystem(struct ubifs_info *c)
int min_leb_cnt = UBIFS_MIN_LEB_CNT;
long long tmp64, main_bytes;
__le64 tmp_le64;
+ __le32 tmp_le32;
+ struct timespec ts;
/* Some functions called from here depend on the @c->key_len field */
c->key_len = UBIFS_SK_LEN;
@@ -298,13 +300,17 @@ static int create_default_filesystem(struct ubifs_info *c)
ino->ch.node_type = UBIFS_INO_NODE;
ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);
ino->nlink = cpu_to_le32(2);
- tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);
+
+ ktime_get_real_ts(&ts);
+ ts = timespec_trunc(ts, DEFAULT_TIME_GRAN);
+ tmp_le64 = cpu_to_le64(ts.tv_sec);
ino->atime_sec = tmp_le64;
ino->ctime_sec = tmp_le64;
ino->mtime_sec = tmp_le64;
- ino->atime_nsec = 0;
- ino->ctime_nsec = 0;
- ino->mtime_nsec = 0;
+ tmp_le32 = cpu_to_le32(ts.tv_nsec);
+ ino->atime_nsec = tmp_le32;
+ ino->ctime_nsec = tmp_le32;
+ ino->mtime_nsec = tmp_le32;
ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO);
ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ);
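The superblock now records a real, granularity-truncated timestamp instead of the removed CURRENT_TIME_SEC. A userspace analogue of the truncation step, assuming a 1-second grain as a stand-in for the filesystem's actual DEFAULT_TIME_GRAN:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;
	long grain = 1000000000L;	/* assumed 1s granularity */

	clock_gettime(CLOCK_REALTIME, &ts);
	printf("full:      %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
	ts.tv_nsec -= ts.tv_nsec % grain;	/* truncate to the grain */
	printf("truncated: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}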
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4da10a6d702a..298b4d89eee9 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1753,13 +1753,23 @@ int ubifs_check_dir_empty(struct inode *dir);
/* xattr.c */
extern const struct xattr_handler *ubifs_xattr_handlers[];
ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-int ubifs_init_security(struct inode *dentry, struct inode *inode,
- const struct qstr *qstr);
int ubifs_xattr_set(struct inode *host, const char *name, const void *value,
size_t size, int flags);
ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
size_t size);
+#ifdef CONFIG_UBIFS_FS_SECURITY
+extern int ubifs_init_security(struct inode *dentry, struct inode *inode,
+ const struct qstr *qstr);
+#else
+static inline int ubifs_init_security(struct inode *dentry,
+ struct inode *inode, const struct qstr *qstr)
+{
+ return 0;
+}
+#endif
+
+
/* super.c */
struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index efe00fcb8b75..6c9e62c2ef55 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -152,7 +152,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
ui->data_len = size;
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = ubifs_current_time(host);
+ host->i_ctime = current_time(host);
host_ui->xattr_cnt += 1;
host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size += CALC_XATTR_BYTES(size);
@@ -234,7 +234,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
mutex_unlock(&ui->ui_mutex);
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = ubifs_current_time(host);
+ host->i_ctime = current_time(host);
host_ui->xattr_size -= CALC_XATTR_BYTES(old_size);
host_ui->xattr_size += CALC_XATTR_BYTES(size);
@@ -488,7 +488,7 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
return err;
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = ubifs_current_time(host);
+ host->i_ctime = current_time(host);
host_ui->xattr_cnt -= 1;
host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
@@ -559,6 +559,7 @@ out_free:
return err;
}
+#ifdef CONFIG_UBIFS_FS_SECURITY
static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
void *fs_info)
{
@@ -599,6 +600,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode,
}
return err;
}
+#endif
static int xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
@@ -639,15 +641,19 @@ static const struct xattr_handler ubifs_trusted_xattr_handler = {
.set = xattr_set,
};
+#ifdef CONFIG_UBIFS_FS_SECURITY
static const struct xattr_handler ubifs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = xattr_get,
.set = xattr_set,
};
+#endif
const struct xattr_handler *ubifs_xattr_handlers[] = {
&ubifs_user_xattr_handler,
&ubifs_trusted_xattr_handler,
+#ifdef CONFIG_UBIFS_FS_SECURITY
&ubifs_security_xattr_handler,
+#endif
NULL
};
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index a0376a2c1c29..f80be4c5df9d 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -82,7 +82,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
ufs_error (sb, "ufs_free_fragments",
"bit already cleared for fragment %u", i);
}
-
+
+ inode_sub_bytes(inode, count << uspi->s_fshift);
fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
uspi->cs_total.cs_nffree += count;
fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -184,6 +185,7 @@ do_more:
ufs_error(sb, "ufs_free_blocks", "freeing free fragment");
}
ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
+ inode_sub_bytes(inode, uspi->s_fpb << uspi->s_fshift);
if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
ufs_clusteracct (sb, ucpi, blkno, 1);
@@ -398,10 +400,12 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
/*
* There is not enough space for user on the device
*/
- if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
- mutex_unlock(&UFS_SB(sb)->s_lock);
- UFSD("EXIT (FAILED)\n");
- return 0;
+ if (unlikely(ufs_freefrags(uspi) <= uspi->s_root_blocks)) {
+ if (!capable(CAP_SYS_RESOURCE)) {
+ mutex_unlock(&UFS_SB(sb)->s_lock);
+ UFSD("EXIT (FAILED)\n");
+ return 0;
+ }
}
if (goal >= uspi->s_size)
@@ -419,12 +423,12 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
if (result) {
ufs_clear_frags(inode, result + oldcount,
newcount - oldcount, locked_page != NULL);
+ *err = 0;
write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
- write_sequnlock(&UFS_I(inode)->meta_lock);
- *err = 0;
UFS_I(inode)->i_lastfrag =
max(UFS_I(inode)->i_lastfrag, fragment + count);
+ write_sequnlock(&UFS_I(inode)->meta_lock);
}
mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
@@ -437,8 +441,10 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
result = ufs_add_fragments(inode, tmp, oldcount, newcount);
if (result) {
*err = 0;
+ read_seqlock_excl(&UFS_I(inode)->meta_lock);
UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
fragment + count);
+ read_sequnlock_excl(&UFS_I(inode)->meta_lock);
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
locked_page != NULL);
mutex_unlock(&UFS_SB(sb)->s_lock);
@@ -449,39 +455,29 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
/*
* allocate new block and move data
*/
- switch (fs32_to_cpu(sb, usb1->fs_optim)) {
- case UFS_OPTSPACE:
+ if (fs32_to_cpu(sb, usb1->fs_optim) == UFS_OPTSPACE) {
request = newcount;
- if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree
- > uspi->s_dsize * uspi->s_minfree / (2 * 100))
- break;
- usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
- break;
- default:
- usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
-
- case UFS_OPTTIME:
+ if (uspi->cs_total.cs_nffree < uspi->s_space_to_time)
+ usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
+ } else {
request = uspi->s_fpb;
- if (uspi->cs_total.cs_nffree < uspi->s_dsize *
- (uspi->s_minfree - 2) / 100)
- break;
- usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
- break;
+ if (uspi->cs_total.cs_nffree > uspi->s_time_to_space)
+ usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTSPACE);
}
result = ufs_alloc_fragments (inode, cgno, goal, request, err);
if (result) {
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
locked_page != NULL);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
ufs_change_blocknr(inode, fragment - oldcount, oldcount,
uspi->s_sbbase + tmp,
uspi->s_sbbase + result, locked_page);
+ *err = 0;
write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
- write_sequnlock(&UFS_I(inode)->meta_lock);
- *err = 0;
UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
fragment + count);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ write_sequnlock(&UFS_I(inode)->meta_lock);
if (newcount < request)
ufs_free_fragments (inode, result + newcount, request - newcount);
ufs_free_fragments (inode, tmp, oldcount);
@@ -494,6 +490,20 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
return 0;
}
+static bool try_add_frags(struct inode *inode, unsigned frags)
+{
+ unsigned size = frags * i_blocksize(inode);
+ spin_lock(&inode->i_lock);
+ __inode_add_bytes(inode, size);
+ if (unlikely((u32)inode->i_blocks != inode->i_blocks)) {
+ __inode_sub_bytes(inode, size);
+ spin_unlock(&inode->i_lock);
+ return false;
+ }
+ spin_unlock(&inode->i_lock);
+ return true;
+}
+
static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
unsigned oldcount, unsigned newcount)
{
@@ -530,6 +540,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
for (i = oldcount; i < newcount; i++)
if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
return 0;
+
+ if (!try_add_frags(inode, count))
+ return 0;
/*
* Block can be extended
*/
@@ -647,6 +660,7 @@ cg_found:
ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
i = uspi->s_fpb - count;
+ inode_sub_bytes(inode, i << uspi->s_fshift);
fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
uspi->cs_total.cs_nffree += i;
fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i);
@@ -657,6 +671,8 @@ cg_found:
result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
if (result == INVBLOCK)
return 0;
+ if (!try_add_frags(inode, count))
+ return 0;
for (i = 0; i < count; i++)
ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
@@ -716,6 +732,8 @@ norot:
return INVBLOCK;
ucpi->c_rotor = result;
gotit:
+ if (!try_add_frags(inode, uspi->s_fpb))
+ return 0;
blkno = ufs_fragstoblks(result);
ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
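
The new try_add_frags() helper charges the allocation to the inode and immediately backs it out if i_blocks no longer fits in 32 bits; the (u32)inode->i_blocks != inode->i_blocks comparison is a truncation test. A standalone sketch of that test (the locking is omitted here; in the kernel the check runs under inode->i_lock):

    #include <stdint.h>
    #include <stdio.h>

    /* Returns 1 if adding `add` keeps the counter representable in 32
     * bits, mirroring the (u32)i_blocks != i_blocks truncation test;
     * on overflow the provisional charge is rolled back. */
    static int try_add_blocks(uint64_t *blocks, uint64_t add)
    {
        *blocks += add;
        if ((uint32_t)*blocks != *blocks) {
            *blocks -= add;
            return 0;
        }
        return 1;
    }

    int main(void)
    {
        uint64_t blocks = 0xfffffff0ULL;

        printf("%d\n", try_add_blocks(&blocks, 8));  /* 1: still fits */
        printf("%d\n", try_add_blocks(&blocks, 64)); /* 0: would overflow */
        return 0;
    }
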
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index de01b8f2aa78..48609f1d9580 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -53,7 +53,7 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len)
mark_inode_dirty(dir);
}
if (IS_DIRSYNC(dir))
- err = write_one_page(page, 1);
+ err = write_one_page(page);
else
unlock_page(page);
return err;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 9774555b3721..d1dd8cc33179 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -176,6 +176,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
struct inode * inode;
+ struct timespec64 ts;
unsigned cg, bit, i, j, start;
struct ufs_inode_info *ufsi;
int err = -ENOSPC;
@@ -323,8 +324,9 @@ cg_found:
lock_buffer(bh);
ufs2_inode = (struct ufs2_inode *)bh->b_data;
ufs2_inode += ufs_inotofsbo(inode->i_ino);
- ufs2_inode->ui_birthtime = cpu_to_fs64(sb, CURRENT_TIME.tv_sec);
- ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, CURRENT_TIME.tv_nsec);
+ ktime_get_real_ts64(&ts);
+ ufs2_inode->ui_birthtime = cpu_to_fs64(sb, ts.tv_sec);
+ ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, ts.tv_nsec);
mark_buffer_dirty(bh);
unlock_buffer(bh);
if (sb->s_flags & MS_SYNCHRONOUS)
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7e41aee7b69a..f36d6a53687d 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -235,7 +235,8 @@ ufs_extend_tail(struct inode *inode, u64 writes_to,
p = ufs_get_direct_data_ptr(uspi, ufsi, block);
tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p),
- new_size, err, locked_page);
+ new_size - (lastfrag & uspi->s_fpbmask), err,
+ locked_page);
return tmp != 0;
}
@@ -284,7 +285,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
goal += uspi->s_fpb;
}
tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment),
- goal, uspi->s_fpb, err, locked_page);
+ goal, nfrags, err, locked_page);
if (!tmp) {
*err = -ENOSPC;
@@ -400,11 +401,20 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff
u64 phys64 = 0;
unsigned frag = fragment & uspi->s_fpbmask;
- if (!create) {
- phys64 = ufs_frag_map(inode, offsets, depth);
- goto out;
- }
+ phys64 = ufs_frag_map(inode, offsets, depth);
+ if (!create)
+ goto done;
+ if (phys64) {
+ if (fragment >= UFS_NDIR_FRAGMENT)
+ goto done;
+ read_seqlock_excl(&UFS_I(inode)->meta_lock);
+ if (fragment < UFS_I(inode)->i_lastfrag) {
+ read_sequnlock_excl(&UFS_I(inode)->meta_lock);
+ goto done;
+ }
+ read_sequnlock_excl(&UFS_I(inode)->meta_lock);
+ }
/* This code is entered only while writing ... */
mutex_lock(&UFS_I(inode)->truncate_mutex);
@@ -448,6 +458,11 @@ out:
}
mutex_unlock(&UFS_I(inode)->truncate_mutex);
return err;
+
+done:
+ if (phys64)
+ map_bh(bh_result, sb, phys64 + frag);
+ return 0;
}
static int ufs_writepage(struct page *page, struct writeback_control *wbc)
@@ -551,10 +566,8 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
*/
inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode);
set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink));
- if (inode->i_nlink == 0) {
- ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
- return -1;
- }
+ if (inode->i_nlink == 0)
+ return -ESTALE;
/*
* Linux now has 32-bit uid and gid, so we can support EFT.
@@ -563,9 +576,9 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode));
inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size);
- inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec);
- inode->i_ctime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec);
- inode->i_mtime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec);
+ inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec);
+ inode->i_ctime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec);
+ inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec);
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_nsec = 0;
inode->i_ctime.tv_nsec = 0;
@@ -599,10 +612,8 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
*/
inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode);
set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink));
- if (inode->i_nlink == 0) {
- ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
- return -1;
- }
+ if (inode->i_nlink == 0)
+ return -ESTALE;
/*
* Linux now has 32-bit uid and gid, so we can support EFT.
@@ -642,7 +653,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino)
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
struct buffer_head * bh;
struct inode *inode;
- int err;
+ int err = -EIO;
UFSD("ENTER, ino %lu\n", ino);
@@ -677,9 +688,10 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino)
err = ufs1_read_inode(inode,
ufs_inode + ufs_inotofsbo(inode->i_ino));
}
-
+ brelse(bh);
if (err)
goto bad_inode;
+
inode->i_version++;
ufsi->i_lastfrag =
(inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -688,15 +700,13 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino)
ufs_set_inode_ops(inode);
- brelse(bh);
-
UFSD("EXIT\n");
unlock_new_inode(inode);
return inode;
bad_inode:
iget_failed(inode);
- return ERR_PTR(-EIO);
+ return ERR_PTR(err);
}
static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
@@ -841,8 +851,11 @@ void ufs_evict_inode(struct inode * inode)
truncate_inode_pages_final(&inode->i_data);
if (want_delete) {
inode->i_size = 0;
- if (inode->i_blocks)
+ if (inode->i_blocks &&
+ (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
ufs_truncate_blocks(inode);
+ ufs_update_inode(inode, inode_needs_sync(inode));
}
invalidate_inode_buffers(inode);
@@ -868,7 +881,6 @@ static inline void free_data(struct to_free *ctx, u64 from, unsigned count)
ctx->to = from + count;
}
-#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift)
#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift)
static void ufs_trunc_direct(struct inode *inode)
@@ -1100,25 +1112,30 @@ out:
return err;
}
-static void __ufs_truncate_blocks(struct inode *inode)
+static void ufs_truncate_blocks(struct inode *inode)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
unsigned offsets[4];
- int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets);
+ int depth;
int depth2;
unsigned i;
struct ufs_buffer_head *ubh[3];
void *p;
u64 block;
- if (!depth)
- return;
+ if (inode->i_size) {
+ sector_t last = (inode->i_size - 1) >> uspi->s_bshift;
+ depth = ufs_block_to_path(inode, last, offsets);
+ if (!depth)
+ return;
+ } else {
+ depth = 1;
+ }
- /* find the last non-zero in offsets[] */
for (depth2 = depth - 1; depth2; depth2--)
- if (offsets[depth2])
+ if (offsets[depth2] != uspi->s_apb - 1)
break;
mutex_lock(&ufsi->truncate_mutex);
@@ -1127,9 +1144,8 @@ static void __ufs_truncate_blocks(struct inode *inode)
offsets[0] = UFS_IND_BLOCK;
} else {
/* get the blocks that should be partially emptied */
- p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]);
+ p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]++);
for (i = 0; i < depth2; i++) {
- offsets[i]++; /* next branch is fully freed */
block = ufs_data_ptr_to_cpu(sb, p);
if (!block)
break;
@@ -1140,7 +1156,7 @@ static void __ufs_truncate_blocks(struct inode *inode)
write_sequnlock(&ufsi->meta_lock);
break;
}
- p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]);
+ p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]++);
}
while (i--)
free_branch_tail(inode, offsets[i + 1], ubh[i], depth - i - 1);
@@ -1155,7 +1171,9 @@ static void __ufs_truncate_blocks(struct inode *inode)
free_full_branch(inode, block, i - UFS_IND_BLOCK + 1);
}
}
+ read_seqlock_excl(&ufsi->meta_lock);
ufsi->i_lastfrag = DIRECT_FRAGMENT;
+ read_sequnlock_excl(&ufsi->meta_lock);
mark_inode_dirty(inode);
mutex_unlock(&ufsi->truncate_mutex);
}
@@ -1183,7 +1201,7 @@ static int ufs_truncate(struct inode *inode, loff_t size)
truncate_setsize(inode, size);
- __ufs_truncate_blocks(inode);
+ ufs_truncate_blocks(inode);
inode->i_mtime = inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
out:
@@ -1191,16 +1209,6 @@ out:
return err;
}
-static void ufs_truncate_blocks(struct inode *inode)
-{
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
- __ufs_truncate_blocks(inode);
-}
-
int ufs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
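
Throughout this file the i_lastfrag updates move inside the meta_lock seqlock (write_seqlock() on the allocation paths, read_seqlock_excl() where a path must update exclusively), so lockless readers retrying on the sequence counter never observe a torn value. A deliberately simplified single-writer sketch of the seqlock protocol, using default seq_cst C11 atomics; the kernel implementation uses finer-grained barriers and annotations, and the plain load/store of the protected value below is tolerable only as an illustration:

    #include <stdatomic.h>
    #include <stdint.h>

    /* Writers bump the sequence to odd, update, then bump back to even;
     * readers retry if they saw an odd sequence or it changed. */
    static atomic_uint seq;
    static uint64_t lastfrag;            /* the protected value */

    static void write_lastfrag(uint64_t v)
    {
        atomic_fetch_add(&seq, 1);       /* odd: write in progress */
        lastfrag = v;
        atomic_fetch_add(&seq, 1);       /* even: write complete */
    }

    static uint64_t read_lastfrag(void)
    {
        unsigned s;
        uint64_t v;

        do {
            s = atomic_load(&seq);
            v = lastfrag;
        } while ((s & 1) || s != atomic_load(&seq));
        return v;
    }

    int main(void)
    {
        write_lastfrag(7);
        return (int)read_lastfrag();     /* 7 */
    }
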
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 131b2b77c818..0a4f58a5073c 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -480,7 +480,7 @@ static void ufs_setup_cstotal(struct super_block *sb)
usb3 = ubh_get_usb_third(uspi);
if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
- (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
+ (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) ||
mtype == UFS_MOUNT_UFSTYPE_UFS2) {
/* we have statistics in a different place than usual */
uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir);
@@ -596,9 +596,7 @@ static void ufs_put_cstotal(struct super_block *sb)
usb2 = ubh_get_usb_second(uspi);
usb3 = ubh_get_usb_third(uspi);
- if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
- (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
- mtype == UFS_MOUNT_UFSTYPE_UFS2) {
+ if (mtype == UFS_MOUNT_UFSTYPE_UFS2) {
/* we have statistics in a different place than usual */
usb2->fs_un.fs_u2.cs_ndir =
cpu_to_fs64(sb, uspi->cs_total.cs_ndir);
@@ -608,16 +606,26 @@ static void ufs_put_cstotal(struct super_block *sb)
cpu_to_fs64(sb, uspi->cs_total.cs_nifree);
usb3->fs_un1.fs_u2.cs_nffree =
cpu_to_fs64(sb, uspi->cs_total.cs_nffree);
- } else {
- usb1->fs_cstotal.cs_ndir =
- cpu_to_fs32(sb, uspi->cs_total.cs_ndir);
- usb1->fs_cstotal.cs_nbfree =
- cpu_to_fs32(sb, uspi->cs_total.cs_nbfree);
- usb1->fs_cstotal.cs_nifree =
- cpu_to_fs32(sb, uspi->cs_total.cs_nifree);
- usb1->fs_cstotal.cs_nffree =
- cpu_to_fs32(sb, uspi->cs_total.cs_nffree);
+ goto out;
+ }
+
+ if (mtype == UFS_MOUNT_UFSTYPE_44BSD &&
+ (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) {
+ /* store stats in both old and new places */
+ usb2->fs_un.fs_u2.cs_ndir =
+ cpu_to_fs64(sb, uspi->cs_total.cs_ndir);
+ usb2->fs_un.fs_u2.cs_nbfree =
+ cpu_to_fs64(sb, uspi->cs_total.cs_nbfree);
+ usb3->fs_un1.fs_u2.cs_nifree =
+ cpu_to_fs64(sb, uspi->cs_total.cs_nifree);
+ usb3->fs_un1.fs_u2.cs_nffree =
+ cpu_to_fs64(sb, uspi->cs_total.cs_nffree);
}
+ usb1->fs_cstotal.cs_ndir = cpu_to_fs32(sb, uspi->cs_total.cs_ndir);
+ usb1->fs_cstotal.cs_nbfree = cpu_to_fs32(sb, uspi->cs_total.cs_nbfree);
+ usb1->fs_cstotal.cs_nifree = cpu_to_fs32(sb, uspi->cs_total.cs_nifree);
+ usb1->fs_cstotal.cs_nffree = cpu_to_fs32(sb, uspi->cs_total.cs_nffree);
+out:
ubh_mark_buffer_dirty(USPI_UBH(uspi));
ufs_print_super_stuff(sb, usb1, usb2, usb3);
UFSD("EXIT\n");
@@ -746,6 +754,23 @@ static void ufs_put_super(struct super_block *sb)
return;
}
+static u64 ufs_max_bytes(struct super_block *sb)
+{
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ int bits = uspi->s_apbshift;
+ u64 res;
+
+ if (bits > 21)
+ res = ~0ULL;
+ else
+ res = UFS_NDADDR + (1LL << bits) + (1LL << (2*bits)) +
+ (1LL << (3*bits));
+
+ if (res >= (MAX_LFS_FILESIZE >> uspi->s_bshift))
+ return MAX_LFS_FILESIZE;
+ return res << uspi->s_bshift;
+}
+
static int ufs_fill_super(struct super_block *sb, void *data, int silent)
{
struct ufs_sb_info * sbi;
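
ufs_max_bytes() above computes the largest addressable file from the direct block count plus one, two and three levels of indirection, each level multiplying the reach by the number of block addresses per indirect block (1 << s_apbshift), then clamps the result to MAX_LFS_FILESIZE. The same arithmetic as a standalone check (NDADDR = 12 direct blocks and a shift of 9, i.e. 4 KiB blocks with 8-byte UFS2 pointers, are illustrative assumptions):

    #include <stdint.h>
    #include <stdio.h>

    #define NDADDR 12   /* direct blocks, as in UFS_NDADDR */

    /* Largest file in blocks: direct + single + double + triple indirect.
     * Beyond a shift of 21, 3*shift reaches 64 bits, so saturate. */
    static uint64_t max_blocks(int apbshift)
    {
        if (apbshift > 21)
            return UINT64_MAX;
        return NDADDR + (1ULL << apbshift) + (1ULL << (2 * apbshift)) +
               (1ULL << (3 * apbshift));
    }

    int main(void)
    {
        printf("%llu blocks\n", (unsigned long long)max_blocks(9));
        return 0;   /* 12 + 512 + 262144 + 134217728 blocks */
    }
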
@@ -812,9 +837,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
uspi->s_dirblksize = UFS_SECTOR_SIZE;
super_block_offset=UFS_SBLOCK;
- /* Keep 2Gig file limit. Some UFS variants need to override
- this but as I don't know which I'll let those in the know loosen
- the rules */
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+
switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) {
case UFS_MOUNT_UFSTYPE_44BSD:
UFSD("ufstype=44bsd\n");
@@ -980,6 +1004,13 @@ again:
flags |= UFS_ST_SUN;
}
+ if ((flags & UFS_ST_MASK) == UFS_ST_44BSD &&
+ uspi->s_postblformat == UFS_42POSTBLFMT) {
+ if (!silent)
+ pr_err("this is not a 44bsd filesystem\n");
+ goto failed;
+ }
+
/*
* Check ufs magic number
*/
@@ -1127,8 +1158,8 @@ magic_found:
uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask);
if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
- uspi->s_u2_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size);
- uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
+ uspi->s_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size);
+ uspi->s_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
} else {
uspi->s_size = fs32_to_cpu(sb, usb1->fs_size);
uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize);
@@ -1177,6 +1208,18 @@ magic_found:
uspi->s_postbloff = fs32_to_cpu(sb, usb3->fs_postbloff);
uspi->s_rotbloff = fs32_to_cpu(sb, usb3->fs_rotbloff);
+ uspi->s_root_blocks = mul_u64_u32_div(uspi->s_dsize,
+ uspi->s_minfree, 100);
+ if (uspi->s_minfree <= 5) {
+ uspi->s_time_to_space = ~0ULL;
+ uspi->s_space_to_time = 0;
+ usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTSPACE);
+ } else {
+ uspi->s_time_to_space = (uspi->s_root_blocks / 2) + 1;
+ uspi->s_space_to_time = mul_u64_u32_div(uspi->s_dsize,
+ uspi->s_minfree - 2, 100) - 1;
+ }
+
/*
* Compute other frequently used values
*/
@@ -1212,6 +1255,7 @@ magic_found:
"fast symlink size (%u)\n", uspi->s_maxsymlinklen);
uspi->s_maxsymlinklen = maxsymlen;
}
+ sb->s_maxbytes = ufs_max_bytes(sb);
sb->s_max_links = UFS_LINK_MAX;
inode = ufs_iget(sb, UFS_ROOTINO);
@@ -1365,19 +1409,17 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
mutex_lock(&UFS_SB(sb)->s_lock);
usb3 = ubh_get_usb_third(uspi);
- if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+ if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
buf->f_type = UFS2_MAGIC;
- buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
- } else {
+ else
buf->f_type = UFS_MAGIC;
- buf->f_blocks = uspi->s_dsize;
- }
- buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
- uspi->cs_total.cs_nffree;
+
+ buf->f_blocks = uspi->s_dsize;
+ buf->f_bfree = ufs_freefrags(uspi);
buf->f_ffree = uspi->cs_total.cs_nifree;
buf->f_bsize = sb->s_blocksize;
- buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree))
- ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
+ buf->f_bavail = (buf->f_bfree > uspi->s_root_blocks)
+ ? (buf->f_bfree - uspi->s_root_blocks) : 0;
buf->f_files = uspi->s_ncg * uspi->s_ipg;
buf->f_namelen = UFS_MAXNAMLEN;
buf->f_fsid.val[0] = (u32)id;
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 0cbd5d340b67..150eef6f1233 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -733,10 +733,8 @@ struct ufs_sb_private_info {
__u32 s_dblkno; /* offset of first data after cg */
__u32 s_cgoffset; /* cylinder group offset in cylinder */
__u32 s_cgmask; /* used to calc mod fs_ntrak */
- __u32 s_size; /* number of blocks (fragments) in fs */
- __u32 s_dsize; /* number of data blocks in fs */
- __u64 s_u2_size; /* ufs2: number of blocks (fragments) in fs */
- __u64 s_u2_dsize; /*ufs2: number of data blocks in fs */
+ __u64 s_size; /* number of blocks (fragments) in fs */
+ __u64 s_dsize; /* number of data blocks in fs */
__u32 s_ncg; /* number of cylinder groups */
__u32 s_bsize; /* size of basic blocks */
__u32 s_fsize; /* size of fragments */
@@ -793,6 +791,9 @@ struct ufs_sb_private_info {
__u32 s_maxsymlinklen;/* upper limit on fast symlinks' size */
__s32 fs_magic; /* filesystem magic */
unsigned int s_dirblksize;
+ __u64 s_root_blocks;
+ __u64 s_time_to_space;
+ __u64 s_space_to_time;
};
/*
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index f41ad0a6106f..02497a492eb2 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -243,9 +243,8 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev
struct page *ufs_get_locked_page(struct address_space *mapping,
pgoff_t index)
{
- struct page *page;
-
- page = find_lock_page(mapping, index);
+ struct inode *inode = mapping->host;
+ struct page *page = find_lock_page(mapping, index);
if (!page) {
page = read_mapping_page(mapping, index, NULL);
@@ -253,7 +252,7 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
printk(KERN_ERR "ufs_change_blocknr: "
"read_mapping_page error: ino %lu, index: %lu\n",
mapping->host->i_ino, index);
- goto out;
+ return page;
}
lock_page(page);
@@ -262,8 +261,7 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
/* Truncate got there first */
unlock_page(page);
put_page(page);
- page = NULL;
- goto out;
+ return NULL;
}
if (!PageUptodate(page) || PageError(page)) {
@@ -272,11 +270,12 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
printk(KERN_ERR "ufs_change_blocknr: "
"can not read page: ino %lu, index: %lu\n",
- mapping->host->i_ino, index);
+ inode->i_ino, index);
- page = ERR_PTR(-EIO);
+ return ERR_PTR(-EIO);
}
}
-out:
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << inode->i_blkbits, 0);
return page;
}
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index b7fbf53dbc81..9fc7119a1551 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -350,16 +350,11 @@ static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi,
#define ubh_blkmap(ubh,begin,bit) \
((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb)))
-/*
- * Determine the number of available frags given a
- * percentage to hold in reserve.
- */
static inline u64
-ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved)
+ufs_freefrags(struct ufs_sb_private_info *uspi)
{
return ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
- uspi->cs_total.cs_nffree -
- (uspi->s_dsize * (percentreserved) / 100);
+ uspi->cs_total.cs_nffree;
}
/*
@@ -473,15 +468,19 @@ static inline unsigned _ubh_find_last_zero_bit_(
static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi,
struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
{
+ u8 mask;
switch (uspi->s_fpb) {
case 8:
return (*ubh_get_addr (ubh, begin + block) == 0xff);
case 4:
- return (*ubh_get_addr (ubh, begin + (block >> 1)) == (0x0f << ((block & 0x01) << 2)));
+ mask = 0x0f << ((block & 0x01) << 2);
+ return (*ubh_get_addr (ubh, begin + (block >> 1)) & mask) == mask;
case 2:
- return (*ubh_get_addr (ubh, begin + (block >> 2)) == (0x03 << ((block & 0x03) << 1)));
+ mask = 0x03 << ((block & 0x03) << 1);
+ return (*ubh_get_addr (ubh, begin + (block >> 2)) & mask) == mask;
case 1:
- return (*ubh_get_addr (ubh, begin + (block >> 3)) == (0x01 << (block & 0x07)));
+ mask = 0x01 << (block & 0x07);
+ return (*ubh_get_addr (ubh, begin + (block >> 3)) & mask) == mask;
}
return 0;
}
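
The util.h hunk fixes a long-standing test: the old code compared the whole bitmap byte against the block's mask, so unrelated fragments sharing the byte made a fully-set block report as clear. Masking first tests only the block's own bits. A standalone demonstration:

    #include <stdint.h>
    #include <stdio.h>

    static int isblockset_buggy(uint8_t byte, uint8_t mask)
    {
        return byte == mask;          /* neighbours' bits break this */
    }

    static int isblockset_fixed(uint8_t byte, uint8_t mask)
    {
        return (byte & mask) == mask; /* examine only our bits */
    }

    int main(void)
    {
        uint8_t mask = 0x0f;  /* 4 fragments per block, low block of byte */
        uint8_t byte = 0xff;  /* both blocks in this byte fully set */

        printf("buggy=%d fixed=%d\n",
               isblockset_buggy(byte, mask),
               isblockset_fixed(byte, mask));
        return 0;             /* prints buggy=0 fixed=1 */
    }
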
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f7555fc25877..cadcd12a3d35 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -81,7 +81,7 @@ struct userfaultfd_unmap_ctx {
struct userfaultfd_wait_queue {
struct uffd_msg msg;
- wait_queue_t wq;
+ wait_queue_entry_t wq;
struct userfaultfd_ctx *ctx;
bool waken;
};
@@ -91,7 +91,7 @@ struct userfaultfd_wake_range {
unsigned long len;
};
-static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
+static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
struct userfaultfd_wake_range *range = key;
@@ -129,7 +129,7 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
* wouldn't be enough, the smp_mb__before_spinlock is
* enough to avoid an explicit smp_mb() here.
*/
- list_del_init(&wq->task_list);
+ list_del_init(&wq->entry);
out:
return ret;
}
@@ -214,6 +214,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
* hugepmd ranges.
*/
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *vma,
unsigned long address,
unsigned long flags,
unsigned long reason)
@@ -224,7 +225,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
- pte = huge_pte_offset(mm, address);
+ pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
if (!pte)
goto out;
@@ -243,6 +244,7 @@ out:
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *vma,
unsigned long address,
unsigned long flags,
unsigned long reason)
@@ -340,9 +342,28 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
bool must_wait, return_to_userland;
long blocking_state;
- BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
-
ret = VM_FAULT_SIGBUS;
+
+ /*
+ * We don't do userfault handling for the final child pid update.
+ *
+ * We also don't do userfault handling during
+ * coredumping. hugetlbfs has the special
+ * follow_hugetlb_page() to skip missing pages in the
+ * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
+ * the no_page_table() helper in follow_page_mask(), but the
+ * shmem_vm_ops->fault method is invoked even during
+ * coredumping without mmap_sem and it ends up here.
+ */
+ if (current->flags & (PF_EXITING|PF_DUMPCORE))
+ goto out;
+
+ /*
+ * Coredumping runs without mmap_sem so we can only check that
+ * the mmap_sem is held, if PF_DUMPCORE was not set.
+ */
+ WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
+
ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
goto out;
@@ -361,12 +382,6 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
goto out;
/*
- * We don't do userfault handling for the final child pid update.
- */
- if (current->flags & PF_EXITING)
- goto out;
-
- /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
@@ -435,7 +450,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
reason);
else
- must_wait = userfaultfd_huge_must_wait(ctx, vmf->address,
+ must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
+ vmf->address,
vmf->flags, reason);
up_read(&mm->mmap_sem);
@@ -509,13 +525,13 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
* and it's fine not to block on the spinlock. The uwq on this
* kernel stack can be released after the list_del_init.
*/
- if (!list_empty_careful(&uwq.wq.task_list)) {
+ if (!list_empty_careful(&uwq.wq.entry)) {
spin_lock(&ctx->fault_pending_wqh.lock);
/*
* No need of list_del_init(), the uwq on the stack
* will be freed shortly anyway.
*/
- list_del(&uwq.wq.task_list);
+ list_del(&uwq.wq.entry);
spin_unlock(&ctx->fault_pending_wqh.lock);
}
@@ -847,7 +863,7 @@ wakeup:
static inline struct userfaultfd_wait_queue *find_userfault_in(
wait_queue_head_t *wqh)
{
- wait_queue_t *wq;
+ wait_queue_entry_t *wq;
struct userfaultfd_wait_queue *uwq;
VM_BUG_ON(!spin_is_locked(&wqh->lock));
@@ -856,7 +872,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in(
if (!waitqueue_active(wqh))
goto out;
/* walk in reverse to provide FIFO behavior to read userfaults */
- wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list);
+ wq = list_last_entry(&wqh->head, typeof(*wq), entry);
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
out:
return uwq;
@@ -990,14 +1006,14 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
* changes __remove_wait_queue() to use
* list_del_init() in turn breaking the
* !list_empty_careful() check in
- * handle_userfault(). The uwq->wq.task_list
+ * handle_userfault(). The uwq->wq.head list
* must never be empty at any time during the
* refile, or the waitqueue could disappear
* from under us. The "wait_queue_head_t"
* parameter of __remove_wait_queue() is unused
* anyway.
*/
- list_del(&uwq->wq.task_list);
+ list_del(&uwq->wq.entry);
__add_wait_queue(&ctx->fault_wqh, &uwq->wq);
write_seqcount_end(&ctx->refile_seq);
@@ -1019,7 +1035,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
fork_nctx = (struct userfaultfd_ctx *)
(unsigned long)
uwq->msg.arg.reserved.reserved1;
- list_move(&uwq->wq.task_list, &fork_event);
+ list_move(&uwq->wq.entry, &fork_event);
spin_unlock(&ctx->event_wqh.lock);
ret = 0;
break;
@@ -1056,8 +1072,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
if (!list_empty(&fork_event)) {
uwq = list_first_entry(&fork_event,
typeof(*uwq),
- wq.task_list);
- list_del(&uwq->wq.task_list);
+ wq.entry);
+ list_del(&uwq->wq.entry);
__add_wait_queue(&ctx->event_wqh, &uwq->wq);
userfaultfd_event_complete(ctx, uwq);
}
@@ -1101,11 +1117,6 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
static void __wake_userfault(struct userfaultfd_ctx *ctx,
struct userfaultfd_wake_range *range)
{
- unsigned long start, end;
-
- start = range->start;
- end = range->start + range->len;
-
spin_lock(&ctx->fault_pending_wqh.lock);
/* wake all in the range and autoremove */
if (waitqueue_active(&ctx->fault_pending_wqh))
@@ -1734,17 +1745,17 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
struct userfaultfd_ctx *ctx = f->private_data;
- wait_queue_t *wq;
+ wait_queue_entry_t *wq;
struct userfaultfd_wait_queue *uwq;
unsigned long pending = 0, total = 0;
spin_lock(&ctx->fault_pending_wqh.lock);
- list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
+ list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
pending++;
total++;
}
- list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
+ list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
total++;
}
diff --git a/fs/xattr.c b/fs/xattr.c
index 7e3317cf4045..464c94bf65f9 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -431,12 +431,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
if (size) {
if (size > XATTR_SIZE_MAX)
return -E2BIG;
- kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
- if (!kvalue) {
- kvalue = vmalloc(size);
- if (!kvalue)
- return -ENOMEM;
- }
+ kvalue = kvmalloc(size, GFP_KERNEL);
+ if (!kvalue)
+ return -ENOMEM;
if (copy_from_user(kvalue, value, size)) {
error = -EFAULT;
goto out;
@@ -528,12 +525,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
if (size) {
if (size > XATTR_SIZE_MAX)
size = XATTR_SIZE_MAX;
- kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
- if (!kvalue) {
- kvalue = vmalloc(size);
- if (!kvalue)
- return -ENOMEM;
- }
+ kvalue = kvzalloc(size, GFP_KERNEL);
+ if (!kvalue)
+ return -ENOMEM;
}
error = vfs_getxattr(d, kname, kvalue, size);
@@ -611,12 +605,9 @@ listxattr(struct dentry *d, char __user *list, size_t size)
if (size) {
if (size > XATTR_LIST_MAX)
size = XATTR_LIST_MAX;
- klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL);
- if (!klist) {
- klist = vmalloc(size);
- if (!klist)
- return -ENOMEM;
- }
+ klist = kvmalloc(size, GFP_KERNEL);
+ if (!klist)
+ return -ENOMEM;
}
error = vfs_listxattr(d, klist, size);
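
All three xattr syscall paths replace the open-coded "kmalloc with __GFP_NOWARN, fall back to vmalloc" dance with kvmalloc()/kvzalloc(); the matching release is kvfree(), which picks the right free path for either allocator. A userspace model of the call shape after the patch (the _model helpers are stand-ins; in the kernel the two underlying allocators differ):

    #include <stdlib.h>
    #include <string.h>

    /* Stand-ins for kvmalloc()/kvfree(): one allocator here, two in the
     * kernel. The point is that callers stop open-coding the fallback. */
    static void *kvmalloc_model(size_t size) { return malloc(size); }
    static void kvfree_model(void *p) { free(p); }

    /* setxattr-style copy-in after the patch: */
    static void *copy_value(const void *uvalue, size_t size)
    {
        void *kvalue = kvmalloc_model(size);

        if (!kvalue)
            return NULL;              /* -ENOMEM in the kernel */
        memcpy(kvalue, uvalue, size); /* copy_from_user() analogue */
        return kvalue;                /* caller releases with kvfree */
    }

    int main(void)
    {
        char v[8] = "value";
        void *k = copy_value(v, sizeof(v));

        kvfree_model(k);
        return 0;
    }
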
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 35faf128f36d..1b98cfa342ab 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -96,3 +96,16 @@ config XFS_DEBUG
not useful unless you are debugging a particular problem.
Say N unless you are an XFS developer, or you play one on TV.
+
+config XFS_ASSERT_FATAL
+ bool "XFS fatal asserts"
+ default y
+ depends on XFS_FS && XFS_DEBUG
+ help
+ Set the default DEBUG mode ASSERT failure behavior.
+
+ Say Y here to cause DEBUG mode ASSERT failures to result in fatal
+ errors that BUG() the kernel by default. If you say N, ASSERT failures
+ result in warnings.
+
+ This behavior can be modified at runtime via sysfs.
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 5c90f82b8f6b..a6e955bfead8 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -98,8 +98,7 @@ xfs-y += xfs_aops.o \
xfs_sysfs.o \
xfs_trans.o \
xfs_xattr.o \
- kmem.o \
- uuid.o
+ kmem.o
# low-level transaction/log code
xfs-y += xfs_log.o \
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 780fc8986dab..393b6849aeb3 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -67,7 +67,7 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
nofs_flag = memalloc_nofs_save();
lflags = kmem_flags_convert(flags);
- ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+ ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL);
if (flags & KM_NOFS)
memalloc_nofs_restore(nofs_flag);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 33db69be4832..b008ff3250eb 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -111,8 +111,7 @@ xfs_ag_resv_critical(
/* Critically low if less than 10% or max btree height remains. */
return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
- pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL,
- XFS_RANDOM_AG_RESV_CRITICAL);
+ pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
}
/*
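
From here on, every XFS_TEST_ERROR() call drops its XFS_RANDOM_* argument: the injection frequency is looked up from the error tag itself rather than passed at each call site. A small sketch of the tag-indexed shape (the table contents and names here are illustrative, not the kernel's):

    #include <stdio.h>
    #include <stdlib.h>

    enum { ERRTAG_AG_RESV_CRITICAL, ERRTAG_FREE_EXTENT, ERRTAG_MAX };

    /* Per-tag default frequency: fire roughly once per N evaluations. */
    static const unsigned err_freq[ERRTAG_MAX] = { 4, 100 };

    static int test_error(int expr, int tag)
    {
        if (expr)
            return 1;
        return err_freq[tag] && (rand() % err_freq[tag]) == 0;
    }

    int main(void)
    {
        srand(1);
        printf("%d\n", test_error(0, ERRTAG_FREE_EXTENT));
        return 0;
    }
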
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 7486401ccbd3..744dcaec34cc 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -606,7 +606,7 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = {
/*
* Read in the allocation group free block array.
*/
-STATIC int /* error */
+int /* error */
xfs_alloc_read_agfl(
xfs_mount_t *mp, /* mount point structure */
xfs_trans_t *tp, /* transaction pointer */
@@ -2454,8 +2454,7 @@ xfs_agf_read_verify(
!xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
xfs_buf_ioerror(bp, -EFSBADCRC);
else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
- XFS_ERRTAG_ALLOC_READ_AGF,
- XFS_RANDOM_ALLOC_READ_AGF))
+ XFS_ERRTAG_ALLOC_READ_AGF))
xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
@@ -2842,8 +2841,7 @@ xfs_free_extent(
ASSERT(type != XFS_AG_RESV_AGFL);
if (XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_FREE_EXTENT,
- XFS_RANDOM_FREE_EXTENT))
+ XFS_ERRTAG_FREE_EXTENT))
return -EIO;
error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 77d9c27330ab..ef26edc2e938 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -213,6 +213,8 @@ xfs_alloc_get_rec(
int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno, struct xfs_buf **bpp);
int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
struct xfs_buf **agbp);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index e1fcfe7f0a9a..cfde0a0f9706 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -253,7 +253,7 @@ xfs_allocbt_init_ptr_from_cur(
ptr->s = agf->agf_roots[cur->bc_btnum];
}
-STATIC __int64_t
+STATIC int64_t
xfs_bnobt_key_diff(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
@@ -261,42 +261,42 @@ xfs_bnobt_key_diff(
xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
xfs_alloc_key_t *kp = &key->alloc;
- return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+ return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
-STATIC __int64_t
+STATIC int64_t
xfs_cntbt_key_diff(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
{
xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
xfs_alloc_key_t *kp = &key->alloc;
- __int64_t diff;
+ int64_t diff;
- diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
+ diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
if (diff)
return diff;
- return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+ return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
-STATIC __int64_t
+STATIC int64_t
xfs_bnobt_diff_two_keys(
struct xfs_btree_cur *cur,
union xfs_btree_key *k1,
union xfs_btree_key *k2)
{
- return (__int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
+ return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
be32_to_cpu(k2->alloc.ar_startblock);
}
-STATIC __int64_t
+STATIC int64_t
xfs_cntbt_diff_two_keys(
struct xfs_btree_cur *cur,
union xfs_btree_key *k1,
union xfs_btree_key *k2)
{
- __int64_t diff;
+ int64_t diff;
diff = be32_to_cpu(k1->alloc.ar_blockcount) -
be32_to_cpu(k2->alloc.ar_blockcount);
@@ -395,7 +395,6 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = {
};
-#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
xfs_bnobt_keys_inorder(
struct xfs_btree_cur *cur,
@@ -442,7 +441,6 @@ xfs_cntbt_recs_inorder(
be32_to_cpu(r1->alloc.ar_startblock) <
be32_to_cpu(r2->alloc.ar_startblock));
}
-#endif /* DEBUG */
static const struct xfs_btree_ops xfs_bnobt_ops = {
.rec_len = sizeof(xfs_alloc_rec_t),
@@ -462,10 +460,8 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
.key_diff = xfs_bnobt_key_diff,
.buf_ops = &xfs_allocbt_buf_ops,
.diff_two_keys = xfs_bnobt_diff_two_keys,
-#if defined(DEBUG) || defined(XFS_WARN)
.keys_inorder = xfs_bnobt_keys_inorder,
.recs_inorder = xfs_bnobt_recs_inorder,
-#endif
};
static const struct xfs_btree_ops xfs_cntbt_ops = {
@@ -486,10 +482,8 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
.key_diff = xfs_cntbt_key_diff,
.buf_ops = &xfs_allocbt_buf_ops,
.diff_two_keys = xfs_cntbt_diff_two_keys,
-#if defined(DEBUG) || defined(XFS_WARN)
.keys_inorder = xfs_cntbt_keys_inorder,
.recs_inorder = xfs_cntbt_recs_inorder,
-#endif
};
/*
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 6622d46ddec3..ef8a1c75a467 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -114,6 +114,23 @@ xfs_inode_hasattr(
* Overall external interface routines.
*========================================================================*/
+/* Retrieve an extended attribute and its value. Must have iolock. */
+int
+xfs_attr_get_ilocked(
+ struct xfs_inode *ip,
+ struct xfs_da_args *args)
+{
+ if (!xfs_inode_hasattr(ip))
+ return -ENOATTR;
+ else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+ return xfs_attr_shortform_getvalue(args);
+ else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
+ return xfs_attr_leaf_get(args);
+ else
+ return xfs_attr_node_get(args);
+}
+
+/* Retrieve an extended attribute by name, and its value. */
int
xfs_attr_get(
struct xfs_inode *ip,
@@ -141,14 +158,7 @@ xfs_attr_get(
args.op_flags = XFS_DA_OP_OKNOENT;
lock_mode = xfs_ilock_attr_map_shared(ip);
- if (!xfs_inode_hasattr(ip))
- error = -ENOATTR;
- else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
- error = xfs_attr_shortform_getvalue(&args);
- else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
- error = xfs_attr_leaf_get(&args);
- else
- error = xfs_attr_node_get(&args);
+ error = xfs_attr_get_ilocked(ip, &args);
xfs_iunlock(ip, lock_mode);
*valuelenp = args.valuelen;
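
xfs_attr_get() now takes the ilock and delegates the format dispatch to the new xfs_attr_get_ilocked(), the usual "_locked variant" split that lets callers who already hold the lock reuse the lookup. A generic sketch of the idiom with a pthread mutex standing in for the inode lock:

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int value;

    /* _locked variant: caller must already hold `lock`. */
    static int get_value_locked(void)
    {
        return value;
    }

    /* Convenience wrapper: acquire, delegate, release. */
    static int get_value(void)
    {
        int v;

        pthread_mutex_lock(&lock);
        v = get_value_locked();
        pthread_mutex_unlock(&lock);
        return v;
    }

    int main(void)
    {
        pthread_mutex_lock(&lock);
        value = 42;
        pthread_mutex_unlock(&lock);
        return get_value();
    }
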
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2852521fc8ec..c6c15e5717e4 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -351,7 +351,7 @@ xfs_attr3_leaf_read(
err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
return err;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d52f525f5b2d..5236d8e45146 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -253,7 +253,7 @@ xfs_attr_rmtval_copyout(
xfs_ino_t ino,
int *offset,
int *valuelen,
- __uint8_t **dst)
+ uint8_t **dst)
{
char *src = bp->b_addr;
xfs_daddr_t bno = bp->b_bn;
@@ -301,7 +301,7 @@ xfs_attr_rmtval_copyin(
xfs_ino_t ino,
int *offset,
int *valuelen,
- __uint8_t **src)
+ uint8_t **src)
{
char *dst = bp->b_addr;
xfs_daddr_t bno = bp->b_bn;
@@ -355,7 +355,7 @@ xfs_attr_rmtval_get(
struct xfs_mount *mp = args->dp->i_mount;
struct xfs_buf *bp;
xfs_dablk_t lblkno = args->rmtblkno;
- __uint8_t *dst = args->value;
+ uint8_t *dst = args->value;
int valuelen;
int nmap;
int error;
@@ -386,7 +386,8 @@ xfs_attr_rmtval_get(
(map[i].br_startblock != HOLESTARTBLOCK));
dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ error = xfs_trans_read_buf(mp, args->trans,
+ mp->m_ddev_targp,
dblkno, dblkcnt, 0, &bp,
&xfs_attr3_rmt_buf_ops);
if (error)
@@ -395,7 +396,7 @@ xfs_attr_rmtval_get(
error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
&offset, &valuelen,
&dst);
- xfs_buf_relse(bp);
+ xfs_trans_brelse(args->trans, bp);
if (error)
return error;
@@ -421,7 +422,7 @@ xfs_attr_rmtval_set(
struct xfs_bmbt_irec map;
xfs_dablk_t lblkno;
xfs_fileoff_t lfileoff = 0;
- __uint8_t *src = args->value;
+ uint8_t *src = args->value;
int blkcnt;
int valuelen;
int nmap;
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index 90928bbe693c..afd684ae3136 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -31,10 +31,10 @@ typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
* We generate this then sort it, attr_list() must return things in hash-order.
*/
typedef struct xfs_attr_sf_sort {
- __uint8_t entno; /* entry number in original list */
- __uint8_t namelen; /* length of name value (no null) */
- __uint8_t valuelen; /* length of value */
- __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
+ uint8_t entno; /* entry number in original list */
+ uint8_t namelen; /* length of name value (no null) */
+ uint8_t valuelen; /* length of value */
+ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
xfs_dahash_t hash; /* this entry's hash value */
unsigned char *name; /* name value, pointer into buffer */
} xfs_attr_sf_sort_t;
@@ -42,7 +42,7 @@ typedef struct xfs_attr_sf_sort {
#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \
(((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen)))
#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \
- ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1)
+ ((1 << (NBBY*(int)sizeof(uint8_t))) - 1)
#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \
((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \
diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h
index e1649c0d3e02..61c6b2025d0c 100644
--- a/fs/xfs/libxfs/xfs_bit.h
+++ b/fs/xfs/libxfs/xfs_bit.h
@@ -25,47 +25,47 @@
/*
* masks with n high/low bits set, 64-bit values
*/
-static inline __uint64_t xfs_mask64hi(int n)
+static inline uint64_t xfs_mask64hi(int n)
{
- return (__uint64_t)-1 << (64 - (n));
+ return (uint64_t)-1 << (64 - (n));
}
-static inline __uint32_t xfs_mask32lo(int n)
+static inline uint32_t xfs_mask32lo(int n)
{
- return ((__uint32_t)1 << (n)) - 1;
+ return ((uint32_t)1 << (n)) - 1;
}
-static inline __uint64_t xfs_mask64lo(int n)
+static inline uint64_t xfs_mask64lo(int n)
{
- return ((__uint64_t)1 << (n)) - 1;
+ return ((uint64_t)1 << (n)) - 1;
}
/* Get high bit set out of 32-bit argument, -1 if none set */
-static inline int xfs_highbit32(__uint32_t v)
+static inline int xfs_highbit32(uint32_t v)
{
return fls(v) - 1;
}
/* Get high bit set out of 64-bit argument, -1 if none set */
-static inline int xfs_highbit64(__uint64_t v)
+static inline int xfs_highbit64(uint64_t v)
{
return fls64(v) - 1;
}
/* Get low bit set out of 32-bit argument, -1 if none set */
-static inline int xfs_lowbit32(__uint32_t v)
+static inline int xfs_lowbit32(uint32_t v)
{
return ffs(v) - 1;
}
/* Get low bit set out of 64-bit argument, -1 if none set */
-static inline int xfs_lowbit64(__uint64_t v)
+static inline int xfs_lowbit64(uint64_t v)
{
- __uint32_t w = (__uint32_t)v;
+ uint32_t w = (uint32_t)v;
int n = 0;
if (w) { /* lower bits */
n = ffs(w);
} else { /* upper bits */
- w = (__uint32_t)(v >> 32);
+ w = (uint32_t)(v >> 32);
if (w) {
n = ffs(w);
if (n)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index f02eb7673392..0a9880777c9c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1280,7 +1280,6 @@ xfs_bmap_read_extents(
xfs_bmbt_rec_t *frp;
xfs_fsblock_t nextbno;
xfs_extnum_t num_recs;
- xfs_extnum_t start;
num_recs = xfs_btree_get_numrecs(block);
if (unlikely(i + num_recs > room)) {
@@ -1303,7 +1302,6 @@ xfs_bmap_read_extents(
* Copy records into the extent records.
*/
frp = XFS_BMBT_REC_ADDR(mp, block, 1);
- start = i;
for (j = 0; j < num_recs; j++, i++, frp++) {
xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
trp->l0 = be64_to_cpu(frp->l0);
@@ -2065,8 +2063,10 @@ xfs_bmap_add_extent_delay_real(
}
temp = xfs_bmap_worst_indlen(bma->ip, temp);
temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
- diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ diff = (int)(temp + temp2 -
+ (startblockval(PREV.br_startblock) -
+ (bma->cur ?
+ bma->cur->bc_private.b.allocated : 0)));
if (diff > 0) {
error = xfs_mod_fdblocks(bma->ip->i_mount,
-((int64_t)diff), false);
@@ -2123,7 +2123,6 @@ xfs_bmap_add_extent_delay_real(
temp = da_new;
if (bma->cur)
temp += bma->cur->bc_private.b.allocated;
- ASSERT(temp <= da_old);
if (temp < da_old)
xfs_mod_fdblocks(bma->ip->i_mount,
(int64_t)(da_old - temp), false);
@@ -3993,7 +3992,7 @@ xfs_bmapi_read(
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ mp, XFS_ERRTAG_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
@@ -4474,7 +4473,7 @@ xfs_bmapi_write(
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ mp, XFS_ERRTAG_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
@@ -4695,7 +4694,7 @@ xfs_bmapi_remap(
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ mp, XFS_ERRTAG_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
@@ -5435,6 +5434,7 @@ __xfs_bunmapi(
int whichfork; /* data or attribute fork */
xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
+ xfs_fileoff_t max_len;
trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
@@ -5456,6 +5456,16 @@ __xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);
+ /*
+ * Guesstimate how many blocks we can unmap without running the risk of
+ * blowing out the transaction with a mix of EFIs and reflink
+ * adjustments.
+ */
+ if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
+ max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
+ else
+ max_len = len;
+
if (!(ifp->if_flags & XFS_IFEXTENTS) &&
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
@@ -5500,7 +5510,7 @@ __xfs_bunmapi(
extno = 0;
while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
- (nexts == 0 || extno < nexts)) {
+ (nexts == 0 || extno < nexts) && max_len > 0) {
/*
* Is the found extent after a hole in which bno lives?
* Just back up to the previous extent, if so.
@@ -5532,6 +5542,15 @@ __xfs_bunmapi(
}
if (del.br_startoff + del.br_blockcount > bno + 1)
del.br_blockcount = bno + 1 - del.br_startoff;
+
+ /* How much can we safely unmap? */
+ if (max_len < del.br_blockcount) {
+ del.br_startoff += del.br_blockcount - max_len;
+ if (!wasdel)
+ del.br_startblock += del.br_blockcount - max_len;
+ del.br_blockcount = max_len;
+ }
+
sum = del.br_startblock + del.br_blockcount;
if (isrt &&
(mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
@@ -5708,6 +5727,7 @@ __xfs_bunmapi(
if (!isrt && wasdel)
xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
+ max_len -= del.br_blockcount;
bno = del.br_startoff - 1;
nodelete:
/*
@@ -6078,7 +6098,7 @@ xfs_bmap_shift_extents(
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ mp, XFS_ERRTAG_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmap_shift_extents",
XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
@@ -6230,7 +6250,7 @@ xfs_bmap_split_extent_at(
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ mp, XFS_ERRTAG_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmap_split_extent_at",
XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
@@ -6473,33 +6493,33 @@ xfs_bmap_finish_one(
int whichfork,
xfs_fileoff_t startoff,
xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
+ xfs_filblks_t *blockcount,
xfs_exntst_t state)
{
- int error = 0, done;
+ xfs_fsblock_t firstfsb;
+ int error = 0;
trace_xfs_bmap_deferred(tp->t_mountp,
XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
- ip->i_ino, whichfork, startoff, blockcount, state);
+ ip->i_ino, whichfork, startoff, *blockcount, state);
if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
return -EFSCORRUPTED;
if (XFS_TEST_ERROR(false, tp->t_mountp,
- XFS_ERRTAG_BMAP_FINISH_ONE,
- XFS_RANDOM_BMAP_FINISH_ONE))
+ XFS_ERRTAG_BMAP_FINISH_ONE))
return -EIO;
switch (type) {
case XFS_BMAP_MAP:
- error = xfs_bmapi_remap(tp, ip, startoff, blockcount,
+ error = xfs_bmapi_remap(tp, ip, startoff, *blockcount,
startblock, dfops);
+ *blockcount = 0;
break;
case XFS_BMAP_UNMAP:
- error = xfs_bunmapi(tp, ip, startoff, blockcount,
- XFS_BMAPI_REMAP, 1, &startblock, dfops, &done);
- ASSERT(done);
+ error = __xfs_bunmapi(tp, ip, startoff, blockcount,
+ XFS_BMAPI_REMAP, 1, &firstfsb, dfops);
break;
default:
ASSERT(0);
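
Taken together, the __xfs_bunmapi() clamp above and this xfs_bmap_finish_one() change (its blockcount parameter becomes in/out, as the header hunk below reflects) bound how much one transaction may unmap: each pass does at most max_len blocks, derived from the log reservation, and whatever remains is reported back so the deferred-op machinery can roll the transaction and requeue the rest. A sketch of that bounded-chunk loop (the numbers are arbitrary):

    #include <stdint.h>
    #include <stdio.h>

    /* Unmap at most max_len blocks per pass and report the remainder
     * through the in/out count, as xfs_bmap_finish_one() now does. */
    static void unmap_pass(uint64_t *blockcount, uint64_t max_len)
    {
        uint64_t done = *blockcount < max_len ? *blockcount : max_len;

        /* ... unmap `done` blocks within this transaction ... */
        *blockcount -= done;
    }

    int main(void)
    {
        uint64_t remaining = 1000, per_tx = 300;
        int passes = 0;

        while (remaining) {
            unmap_pass(&remaining, per_tx);
            passes++;         /* transaction roll would happen here */
        }
        printf("%d passes\n", passes);  /* 4 */
        return 0;
    }
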
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index c35a14fa1527..851982a5dfbc 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -271,7 +271,7 @@ struct xfs_bmap_intent {
int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
struct xfs_inode *ip, enum xfs_bmap_intent_type type,
int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
- xfs_filblks_t blockcount, xfs_exntst_t state);
+ xfs_filblks_t *blockcount, xfs_exntst_t state);
int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 6cba69aff077..85de22513014 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -94,8 +94,8 @@ xfs_bmdr_to_bmbt(
*/
STATIC void
__xfs_bmbt_get_all(
- __uint64_t l0,
- __uint64_t l1,
+ uint64_t l0,
+ uint64_t l1,
xfs_bmbt_irec_t *s)
{
int ext_flag;
@@ -573,6 +573,16 @@ xfs_bmbt_init_key_from_rec(
}
STATIC void
+xfs_bmbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ key->bmbt.br_startoff = cpu_to_be64(
+ xfs_bmbt_disk_get_startoff(&rec->bmbt) +
+ xfs_bmbt_disk_get_blockcount(&rec->bmbt) - 1);
+}
+
+STATIC void
xfs_bmbt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
@@ -588,15 +598,25 @@ xfs_bmbt_init_ptr_from_cur(
ptr->l = 0;
}
-STATIC __int64_t
+STATIC int64_t
xfs_bmbt_key_diff(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
{
- return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
+ return (int64_t)be64_to_cpu(key->bmbt.br_startoff) -
cur->bc_rec.b.br_startoff;
}
+STATIC int64_t
+xfs_bmbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) -
+ be64_to_cpu(k2->bmbt.br_startoff);
+}
+
static bool
xfs_bmbt_verify(
struct xfs_buf *bp)
@@ -687,7 +707,6 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = {
};
-#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
xfs_bmbt_keys_inorder(
struct xfs_btree_cur *cur,
@@ -708,7 +727,6 @@ xfs_bmbt_recs_inorder(
xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
xfs_bmbt_disk_get_startoff(&r2->bmbt);
}
-#endif /* DEBUG */
static const struct xfs_btree_ops xfs_bmbt_ops = {
.rec_len = sizeof(xfs_bmbt_rec_t),
@@ -722,14 +740,14 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.get_minrecs = xfs_bmbt_get_minrecs,
.get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
.init_key_from_rec = xfs_bmbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
.key_diff = xfs_bmbt_key_diff,
+ .diff_two_keys = xfs_bmbt_diff_two_keys,
.buf_ops = &xfs_bmbt_buf_ops,
-#if defined(DEBUG) || defined(XFS_WARN)
.keys_inorder = xfs_bmbt_keys_inorder,
.recs_inorder = xfs_bmbt_recs_inorder,
-#endif
};
/*
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 5392674bf893..4da85fff69ad 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -43,7 +43,7 @@ kmem_zone_t *xfs_btree_cur_zone;
/*
* Btree magic numbers.
*/
-static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
+static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
XFS_FIBT_MAGIC, 0 },
{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
@@ -51,12 +51,12 @@ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
XFS_REFC_CRC_MAGIC }
};
-__uint32_t
+uint32_t
xfs_btree_magic(
int crc,
xfs_btnum_t btnum)
{
- __uint32_t magic = xfs_magics[crc][btnum];
+ uint32_t magic = xfs_magics[crc][btnum];
/* Ensure we asked for crc for crc-only magics. */
ASSERT(magic != 0);
@@ -101,8 +101,7 @@ xfs_btree_check_lblock(
be64_to_cpu(block->bb_u.l.bb_rightsib)));
if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
- XFS_ERRTAG_BTREE_CHECK_LBLOCK,
- XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
+ XFS_ERRTAG_BTREE_CHECK_LBLOCK))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
@@ -153,8 +152,7 @@ xfs_btree_check_sblock(
block->bb_u.s.bb_rightsib;
if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
- XFS_ERRTAG_BTREE_CHECK_SBLOCK,
- XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
+ XFS_ERRTAG_BTREE_CHECK_SBLOCK))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
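The XFS_TEST_ERROR call sites in these hunks lose their XFS_RANDOM_* argument, so the macro now takes only the expression, the mount, and the error tag. A hedged sketch of that shape (the helper name is hypothetical; the kernel's actual tag lookup differs):

#include <stdbool.h>

struct xfs_mount;

/* Hypothetical stand-in: consult per-tag injection state. */
static bool errortag_fires(struct xfs_mount *mp, int tag)
{
	(void)mp; (void)tag;
	return false;
}

/*
 * Sketch of the two-argument-tag form used after this series: the
 * injection frequency lives with the tag, not at the call site.
 */
#define XFS_TEST_ERROR(expr, mp, tag) \
	((expr) || errortag_fires((mp), (tag)))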
@@ -568,7 +566,7 @@ xfs_btree_ptr_offset(
/*
* Return a pointer to the n-th record in the btree block.
*/
-STATIC union xfs_btree_rec *
+union xfs_btree_rec *
xfs_btree_rec_addr(
struct xfs_btree_cur *cur,
int n,
@@ -581,7 +579,7 @@ xfs_btree_rec_addr(
/*
* Return a pointer to the n-th key in the btree block.
*/
-STATIC union xfs_btree_key *
+union xfs_btree_key *
xfs_btree_key_addr(
struct xfs_btree_cur *cur,
int n,
@@ -594,7 +592,7 @@ xfs_btree_key_addr(
/*
* Return a pointer to the n-th high key in the btree block.
*/
-STATIC union xfs_btree_key *
+union xfs_btree_key *
xfs_btree_high_key_addr(
struct xfs_btree_cur *cur,
int n,
@@ -607,7 +605,7 @@ xfs_btree_high_key_addr(
/*
* Return a pointer to the n-th block pointer in the btree block.
*/
-STATIC union xfs_btree_ptr *
+union xfs_btree_ptr *
xfs_btree_ptr_addr(
struct xfs_btree_cur *cur,
int n,
@@ -641,7 +639,7 @@ xfs_btree_get_iroot(
* Retrieve the block pointer from the cursor at the given level.
* This may be an inode btree root or from a buffer.
*/
-STATIC struct xfs_btree_block * /* generic btree block pointer */
+struct xfs_btree_block * /* generic btree block pointer */
xfs_btree_get_block(
struct xfs_btree_cur *cur, /* btree cursor */
int level, /* level in btree */
@@ -778,14 +776,14 @@ xfs_btree_lastrec(
*/
void
xfs_btree_offsets(
- __int64_t fields, /* bitmask of fields */
+ int64_t fields, /* bitmask of fields */
const short *offsets, /* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
int *last) /* output: last byte offset */
{
int i; /* current bit number */
- __int64_t imask; /* mask for current bit number */
+ int64_t imask; /* mask for current bit number */
ASSERT(fields != 0);
/*
@@ -1756,7 +1754,7 @@ error0:
return error;
}
-STATIC int
+int
xfs_btree_lookup_get_block(
struct xfs_btree_cur *cur, /* btree cursor */
int level, /* level in the btree */
@@ -1846,7 +1844,7 @@ xfs_btree_lookup(
int *stat) /* success/failure */
{
struct xfs_btree_block *block; /* current btree block */
- __int64_t diff; /* difference for the current key */
+ int64_t diff; /* difference for the current key */
int error; /* error return value */
int keyno; /* current key number */
int level; /* level in the btree */
@@ -4395,7 +4393,7 @@ xfs_btree_visit_blocks(
xfs_btree_readahead_ptr(cur, ptr, 1);
/* save for the next iteration of the loop */
- lptr = *ptr;
+ xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
}
/* for each buffer in the level */
@@ -4435,7 +4433,7 @@ xfs_btree_visit_blocks(
* recovery completion writes the changes to disk.
*/
struct xfs_btree_block_change_owner_info {
- __uint64_t new_owner;
+ uint64_t new_owner;
struct list_head *buffer_list;
};
@@ -4481,7 +4479,7 @@ xfs_btree_block_change_owner(
int
xfs_btree_change_owner(
struct xfs_btree_cur *cur,
- __uint64_t new_owner,
+ uint64_t new_owner,
struct list_head *buffer_list)
{
struct xfs_btree_block_change_owner_info bbcoi;
@@ -4585,7 +4583,7 @@ xfs_btree_simple_query_range(
{
union xfs_btree_rec *recp;
union xfs_btree_key rec_key;
- __int64_t diff;
+ int64_t diff;
int stat;
bool firstrec = true;
int error;
@@ -4682,8 +4680,8 @@ xfs_btree_overlapped_query_range(
union xfs_btree_key *hkp;
union xfs_btree_rec *recp;
struct xfs_btree_block *block;
- __int64_t ldiff;
- __int64_t hdiff;
+ int64_t ldiff;
+ int64_t hdiff;
int level;
struct xfs_buf *bp;
int i;
@@ -4849,12 +4847,14 @@ xfs_btree_query_all(
xfs_btree_query_range_fn fn,
void *priv)
{
- union xfs_btree_irec low_rec;
- union xfs_btree_irec high_rec;
+ union xfs_btree_key low_key;
+ union xfs_btree_key high_key;
+
+ memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
+ memset(&low_key, 0, sizeof(low_key));
+ memset(&high_key, 0xFF, sizeof(high_key));
- memset(&low_rec, 0, sizeof(low_rec));
- memset(&high_rec, 0xFF, sizeof(high_rec));
- return xfs_btree_query_range(cur, &low_rec, &high_rec, fn, priv);
+ return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
}
/*
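xfs_btree_query_all() above brackets the entire keyspace by memset()ing the low key to 0x00 bytes and the high key to 0xFF bytes: because the on-disk keys are big-endian unsigned bitfields, all-zero bytes sort before any real key and all-one bytes sort after, regardless of the key layout. A standalone sketch of the bracketing (key union is illustrative):

#include <stdint.h>
#include <string.h>

/* Illustrative key union; any big-endian unsigned layout works. */
union key {
	unsigned char bytes[16];
	uint64_t be_fields[2];
};

static void all_keys(union key *lo, union key *hi)
{
	memset(lo, 0x00, sizeof(*lo));	/* sorts before every real key */
	memset(hi, 0xFF, sizeof(*hi));	/* sorts after every real key */
}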
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 27bed08261c5..9c95e965cfe5 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -76,7 +76,7 @@ union xfs_btree_rec {
#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi)
#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi)
-__uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
+uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
/*
* For logging record fields.
@@ -150,20 +150,19 @@ struct xfs_btree_ops {
union xfs_btree_rec *rec);
/* difference between key value and cursor value */
- __int64_t (*key_diff)(struct xfs_btree_cur *cur,
+ int64_t (*key_diff)(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
/*
* Difference between key1 and key2 -- positive if key1 > key2,
* negative if key1 < key2, and zero if equal.
*/
- __int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
+ int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
union xfs_btree_key *key1,
union xfs_btree_key *key2);
const struct xfs_buf_ops *buf_ops;
-#if defined(DEBUG) || defined(XFS_WARN)
/* check that k1 is lower than k2 */
int (*keys_inorder)(struct xfs_btree_cur *cur,
union xfs_btree_key *k1,
@@ -173,7 +172,6 @@ struct xfs_btree_ops {
int (*recs_inorder)(struct xfs_btree_cur *cur,
union xfs_btree_rec *r1,
union xfs_btree_rec *r2);
-#endif
};
/*
@@ -213,11 +211,11 @@ typedef struct xfs_btree_cur
union xfs_btree_irec bc_rec; /* current insert/search record value */
struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
- __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */
+ uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */
#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */
#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */
- __uint8_t bc_nlevels; /* number of levels in the tree */
- __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
+ uint8_t bc_nlevels; /* number of levels in the tree */
+ uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
xfs_btnum_t bc_btnum; /* identifies which btree type */
int bc_statoff; /* offset of btree stats array */
union {
@@ -330,7 +328,7 @@ xfs_btree_islastblock(
*/
void
xfs_btree_offsets(
- __int64_t fields, /* bitmask of fields */
+ int64_t fields, /* bitmask of fields */
const short *offsets,/* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
@@ -408,7 +406,7 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
int xfs_btree_insert(struct xfs_btree_cur *, int *);
int xfs_btree_delete(struct xfs_btree_cur *, int *);
int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
-int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
+int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner,
struct list_head *buffer_list);
/*
@@ -434,7 +432,7 @@ static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
}
static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
- __uint16_t numrecs)
+ uint16_t numrecs)
{
block->bb_numrecs = cpu_to_be16(numrecs);
}
@@ -506,4 +504,17 @@ int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
+union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n,
+ struct xfs_btree_block *block);
+union xfs_btree_key *xfs_btree_key_addr(struct xfs_btree_cur *cur, int n,
+ struct xfs_btree_block *block);
+union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n,
+ struct xfs_btree_block *block);
+union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n,
+ struct xfs_btree_block *block);
+int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level,
+ union xfs_btree_ptr *pp, struct xfs_btree_block **blkp);
+struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
+ int level, struct xfs_buf **bpp);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
index a416c7cb23ea..8211f48b98e6 100644
--- a/fs/xfs/libxfs/xfs_cksum.h
+++ b/fs/xfs/libxfs/xfs_cksum.h
@@ -1,7 +1,7 @@
#ifndef _XFS_CKSUM_H
#define _XFS_CKSUM_H 1
-#define XFS_CRC_SEED (~(__uint32_t)0)
+#define XFS_CRC_SEED (~(uint32_t)0)
/*
* Calculate the intermediate checksum for a buffer that has the CRC field
@@ -9,11 +9,11 @@
* cksum_offset parameter. We do not modify the buffer during verification,
* hence we have to split the CRC calculation across the cksum_offset.
*/
-static inline __uint32_t
+static inline uint32_t
xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset)
{
- __uint32_t zero = 0;
- __uint32_t crc;
+ uint32_t zero = 0;
+ uint32_t crc;
/* Calculate CRC up to the checksum. */
crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
@@ -30,7 +30,7 @@ xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset)
* Fast CRC method where the buffer is modified. Callers must have exclusive
* access to the buffer while the calculation takes place.
*/
-static inline __uint32_t
+static inline uint32_t
xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset)
{
/* zero the CRC field */
@@ -48,7 +48,7 @@ xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset)
* so that it is consistent on disk.
*/
static inline __le32
-xfs_end_cksum(__uint32_t crc)
+xfs_end_cksum(uint32_t crc)
{
return ~cpu_to_le32(crc);
}
@@ -62,7 +62,7 @@ xfs_end_cksum(__uint32_t crc)
static inline void
xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
{
- __uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset);
+ uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset);
*(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
}
@@ -73,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
static inline int
xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
{
- __uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset);
+ uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset);
return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
}
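The helpers above split the CRC across the checksum field so a read verifier never writes to the buffer: checksum the bytes before the field, feed four zero bytes in its place, then checksum the remainder. A self-contained userspace sketch of the same split; the bitwise crc32c() here is a slow stand-in for the kernel's accelerated implementation:

#include <stddef.h>
#include <stdint.h>

/* Bitwise CRC-32C (Castagnoli, reflected); slow but self-contained. */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const unsigned char *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/*
 * Mirrors xfs_start_cksum_safe() above: CRC the buffer as if the 4-byte
 * checksum field at cksum_offset were zero, without modifying it.
 */
static uint32_t start_cksum_safe(const char *buf, size_t len, size_t off)
{
	uint32_t zero = 0;
	uint32_t crc;

	crc = crc32c(~0U, buf, off);			/* bytes before the field */
	crc = crc32c(crc, &zero, sizeof(zero));		/* the field, read as zero */
	return crc32c(crc, buf + off + sizeof(zero),
		      len - off - sizeof(zero));	/* bytes after the field */
}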
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 1bdf2888295b..6d4335815c3f 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -263,7 +263,7 @@ xfs_da3_node_read(
err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
which_fork, &xfs_da3_node_buf_ops);
- if (!err && tp) {
+ if (!err && tp && *bpp) {
struct xfs_da_blkinfo *info = (*bpp)->b_addr;
int type;
@@ -1282,7 +1282,7 @@ xfs_da3_fixhashpath(
return;
break;
case XFS_DIR2_LEAFN_MAGIC:
- lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count);
+ lasthash = xfs_dir2_leaf_lasthash(dp, blk->bp, &count);
if (count == 0)
return;
break;
@@ -1502,8 +1502,8 @@ xfs_da3_node_lookup_int(
if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
blk->magic == XFS_DIR3_LEAFN_MAGIC) {
blk->magic = XFS_DIR2_LEAFN_MAGIC;
- blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
- blk->bp, NULL);
+ blk->hashval = xfs_dir2_leaf_lasthash(args->dp,
+ blk->bp, NULL);
break;
}
@@ -1929,8 +1929,8 @@ xfs_da3_path_shift(
blk->magic = XFS_DIR2_LEAFN_MAGIC;
ASSERT(level == path->active-1);
blk->index = 0;
- blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
- blk->bp, NULL);
+ blk->hashval = xfs_dir2_leaf_lasthash(args->dp,
+ blk->bp, NULL);
break;
default:
ASSERT(0);
@@ -1952,7 +1952,7 @@ xfs_da3_path_shift(
* This is implemented with some source-level loop unrolling.
*/
xfs_dahash_t
-xfs_da_hashname(const __uint8_t *name, int namelen)
+xfs_da_hashname(const uint8_t *name, int namelen)
{
xfs_dahash_t hash;
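xfs_da_hashname() rotates each name byte into a 32-bit accumulator; the "source-level loop unrolling" mentioned in the comment processes four bytes per iteration. An equivalent per-character sketch (not the verbatim kernel body, but it should compute the same value, since rotating a byte by multiples of 7 never wraps past bit 31):

#include <stdint.h>

static uint32_t rol32(uint32_t x, unsigned int n)
{
	return (x << n) | (x >> (32 - n));
}

/*
 * Per-character form of the da-btree name hash; the kernel unrolls this
 * four bytes at a time.
 */
static uint32_t da_hashname(const uint8_t *name, int namelen)
{
	uint32_t hash = 0;

	while (namelen-- > 0)
		hash = *name++ ^ rol32(hash, 7);
	return hash;
}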
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 4e29cb6a3627..ae6de17467f2 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -60,10 +60,10 @@ enum xfs_dacmp {
*/
typedef struct xfs_da_args {
struct xfs_da_geometry *geo; /* da block geometry */
- const __uint8_t *name; /* string (maybe not NULL terminated) */
+ const uint8_t *name; /* string (maybe not NULL terminated) */
int namelen; /* length of string (maybe no NULL) */
- __uint8_t filetype; /* filetype of inode for directories */
- __uint8_t *value; /* set of bytes (maybe contain NULLs) */
+ uint8_t filetype; /* filetype of inode for directories */
+ uint8_t *value; /* set of bytes (maybe contain NULLs) */
int valuelen; /* length of value */
int flags; /* argument flags (eg: ATTR_NOCREATE) */
xfs_dahash_t hashval; /* hash value of name */
@@ -207,7 +207,7 @@ int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
struct xfs_buf *dead_buf);
-uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
+uint xfs_da_hashname(const uint8_t *name_string, int name_length);
enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
const unsigned char *name, int len);
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index f1e8d4dbb600..6d77d1a8498a 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -49,7 +49,7 @@ xfs_dir3_sf_entsize(
struct xfs_dir2_sf_hdr *hdr,
int len)
{
- return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t);
+ return xfs_dir2_sf_entsize(hdr, len) + sizeof(uint8_t);
}
static struct xfs_dir2_sf_entry *
@@ -77,7 +77,7 @@ xfs_dir3_sf_nextentry(
* not necessary. For directories without filetype support, the type is always
* unknown and we never store the value.
*/
-static __uint8_t
+static uint8_t
xfs_dir2_sfe_get_ftype(
struct xfs_dir2_sf_entry *sfep)
{
@@ -87,16 +87,16 @@ xfs_dir2_sfe_get_ftype(
static void
xfs_dir2_sfe_put_ftype(
struct xfs_dir2_sf_entry *sfep,
- __uint8_t ftype)
+ uint8_t ftype)
{
ASSERT(ftype < XFS_DIR3_FT_MAX);
}
-static __uint8_t
+static uint8_t
xfs_dir3_sfe_get_ftype(
struct xfs_dir2_sf_entry *sfep)
{
- __uint8_t ftype;
+ uint8_t ftype;
ftype = sfep->name[sfep->namelen];
if (ftype >= XFS_DIR3_FT_MAX)
@@ -107,7 +107,7 @@ xfs_dir3_sfe_get_ftype(
static void
xfs_dir3_sfe_put_ftype(
struct xfs_dir2_sf_entry *sfep,
- __uint8_t ftype)
+ uint8_t ftype)
{
ASSERT(ftype < XFS_DIR3_FT_MAX);
@@ -124,7 +124,7 @@ xfs_dir3_sfe_put_ftype(
static xfs_ino_t
xfs_dir2_sf_get_ino(
struct xfs_dir2_sf_hdr *hdr,
- __uint8_t *from)
+ uint8_t *from)
{
if (hdr->i8count)
return get_unaligned_be64(from) & 0x00ffffffffffffffULL;
@@ -135,7 +135,7 @@ xfs_dir2_sf_get_ino(
static void
xfs_dir2_sf_put_ino(
struct xfs_dir2_sf_hdr *hdr,
- __uint8_t *to,
+ uint8_t *to,
xfs_ino_t ino)
{
ASSERT((ino & 0xff00000000000000ULL) == 0);
@@ -225,7 +225,7 @@ xfs_dir3_sfe_put_ino(
#define XFS_DIR3_DATA_ENTSIZE(n) \
round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
- sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)), \
+ sizeof(xfs_dir2_data_off_t) + sizeof(uint8_t)), \
XFS_DIR2_DATA_ALIGN)
static int
@@ -242,7 +242,7 @@ xfs_dir3_data_entsize(
return XFS_DIR3_DATA_ENTSIZE(n);
}
-static __uint8_t
+static uint8_t
xfs_dir2_data_get_ftype(
struct xfs_dir2_data_entry *dep)
{
@@ -252,16 +252,16 @@ xfs_dir2_data_get_ftype(
static void
xfs_dir2_data_put_ftype(
struct xfs_dir2_data_entry *dep,
- __uint8_t ftype)
+ uint8_t ftype)
{
ASSERT(ftype < XFS_DIR3_FT_MAX);
}
-static __uint8_t
+static uint8_t
xfs_dir3_data_get_ftype(
struct xfs_dir2_data_entry *dep)
{
- __uint8_t ftype = dep->name[dep->namelen];
+ uint8_t ftype = dep->name[dep->namelen];
if (ftype >= XFS_DIR3_FT_MAX)
return XFS_DIR3_FT_UNKNOWN;
@@ -271,7 +271,7 @@ xfs_dir3_data_get_ftype(
static void
xfs_dir3_data_put_ftype(
struct xfs_dir2_data_entry *dep,
- __uint8_t type)
+ uint8_t type)
{
ASSERT(type < XFS_DIR3_FT_MAX);
ASSERT(dep->namelen != 0);
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 9a492a9e19bd..3771edcb301d 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -111,11 +111,11 @@ struct xfs_da3_intnode {
* appropriate.
*/
struct xfs_da3_icnode_hdr {
- __uint32_t forw;
- __uint32_t back;
- __uint16_t magic;
- __uint16_t count;
- __uint16_t level;
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t level;
};
/*
@@ -187,14 +187,14 @@ struct xfs_da3_icnode_hdr {
/*
* Byte offset in data block and shortform entry.
*/
-typedef __uint16_t xfs_dir2_data_off_t;
+typedef uint16_t xfs_dir2_data_off_t;
#define NULLDATAOFF 0xffffU
typedef uint xfs_dir2_data_aoff_t; /* argument form */
/*
* Offset in data space of a data entry.
*/
-typedef __uint32_t xfs_dir2_dataptr_t;
+typedef uint32_t xfs_dir2_dataptr_t;
#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff)
#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0)
@@ -206,7 +206,7 @@ typedef xfs_off_t xfs_dir2_off_t;
/*
* Directory block number (logical dirblk in file)
*/
-typedef __uint32_t xfs_dir2_db_t;
+typedef uint32_t xfs_dir2_db_t;
#define XFS_INO32_SIZE 4
#define XFS_INO64_SIZE 8
@@ -226,9 +226,9 @@ typedef __uint32_t xfs_dir2_db_t;
* over them.
*/
typedef struct xfs_dir2_sf_hdr {
- __uint8_t count; /* count of entries */
- __uint8_t i8count; /* count of 8-byte inode #s */
- __uint8_t parent[8]; /* parent dir inode number */
+ uint8_t count; /* count of entries */
+ uint8_t i8count; /* count of 8-byte inode #s */
+ uint8_t parent[8]; /* parent dir inode number */
} __packed xfs_dir2_sf_hdr_t;
typedef struct xfs_dir2_sf_entry {
@@ -447,11 +447,11 @@ struct xfs_dir3_leaf_hdr {
};
struct xfs_dir3_icleaf_hdr {
- __uint32_t forw;
- __uint32_t back;
- __uint16_t magic;
- __uint16_t count;
- __uint16_t stale;
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t stale;
};
/*
@@ -538,10 +538,10 @@ struct xfs_dir3_free {
* xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
*/
struct xfs_dir3_icfree_hdr {
- __uint32_t magic;
- __uint32_t firstdb;
- __uint32_t nvalid;
- __uint32_t nused;
+ uint32_t magic;
+ uint32_t firstdb;
+ uint32_t nvalid;
+ uint32_t nused;
};
@@ -632,10 +632,10 @@ typedef struct xfs_attr_shortform {
__u8 padding;
} hdr;
struct xfs_attr_sf_entry {
- __uint8_t namelen; /* actual length of name (no NULL) */
- __uint8_t valuelen; /* actual length of value (no NULL) */
- __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
- __uint8_t nameval[1]; /* name & value bytes concatenated */
+ uint8_t namelen; /* actual length of name (no NULL) */
+ uint8_t valuelen; /* actual length of value (no NULL) */
+ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
+ uint8_t nameval[1]; /* name & value bytes concatenated */
} list[1]; /* variable sized array */
} xfs_attr_shortform_t;
@@ -725,22 +725,22 @@ struct xfs_attr3_leafblock {
* incore, neutral version of the attribute leaf header
*/
struct xfs_attr3_icleaf_hdr {
- __uint32_t forw;
- __uint32_t back;
- __uint16_t magic;
- __uint16_t count;
- __uint16_t usedbytes;
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t usedbytes;
/*
* firstused is 32-bit here instead of 16-bit like the on-disk variant
* to support maximum fsb size of 64k without overflow issues throughout
* the attr code. Instead, the overflow condition is handled on
* conversion to/from disk.
*/
- __uint32_t firstused;
+ uint32_t firstused;
__u8 holes;
struct {
- __uint16_t base;
- __uint16_t size;
+ uint16_t base;
+ uint16_t size;
} freemap[XFS_ATTR_LEAF_MAPSIZE];
};
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 2f389d366e93..ccf9783fd3f0 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -218,8 +218,7 @@ xfs_dir_ino_validate(
agblkno != 0 &&
ioff < (1 << mp->m_sb.sb_inopblog) &&
XFS_AGINO_TO_INO(mp, agno, agino) == ino;
- if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
- XFS_RANDOM_DIR_INO_VALIDATE))) {
+ if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE))) {
xfs_warn(mp, "Invalid inode number 0x%Lx",
(unsigned long long) ino);
XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index d6e6d9d16f6c..21c8f8bf94d5 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -47,9 +47,9 @@ struct xfs_dir_ops {
struct xfs_dir2_sf_entry *
(*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
struct xfs_dir2_sf_entry *sfep);
- __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
+ uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
- __uint8_t ftype);
+ uint8_t ftype);
xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
struct xfs_dir2_sf_entry *sfep);
void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
@@ -60,9 +60,9 @@ struct xfs_dir_ops {
xfs_ino_t ino);
int (*data_entsize)(int len);
- __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
+ uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
void (*data_put_ftype)(struct xfs_dir2_data_entry *dep,
- __uint8_t ftype);
+ uint8_t ftype);
__be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
struct xfs_dir2_data_free *
(*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index aa17cb788946..43c902f7a68d 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -139,7 +139,7 @@ xfs_dir3_block_read(
err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
return err;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index b887fb2a2bcf..27297a689d9c 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -145,7 +145,7 @@ xfs_dir3_leaf_check_int(
static bool
xfs_dir3_leaf_verify(
struct xfs_buf *bp,
- __uint16_t magic)
+ uint16_t magic)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_leaf *leaf = bp->b_addr;
@@ -154,7 +154,7 @@ xfs_dir3_leaf_verify(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
- __uint16_t magic3;
+ uint16_t magic3;
magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
: XFS_DIR3_LEAFN_MAGIC;
@@ -178,7 +178,7 @@ xfs_dir3_leaf_verify(
static void
__read_verify(
struct xfs_buf *bp,
- __uint16_t magic)
+ uint16_t magic)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
@@ -195,7 +195,7 @@ __read_verify(
static void
__write_verify(
struct xfs_buf *bp,
- __uint16_t magic)
+ uint16_t magic)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_buf_log_item *bip = bp->b_fspriv;
@@ -256,7 +256,7 @@ const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
.verify_write = xfs_dir3_leafn_write_verify,
};
-static int
+int
xfs_dir3_leaf_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
@@ -268,7 +268,7 @@ xfs_dir3_leaf_read(
err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
return err;
}
@@ -285,7 +285,7 @@ xfs_dir3_leafn_read(
err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
return err;
}
@@ -299,7 +299,7 @@ xfs_dir3_leaf_init(
struct xfs_trans *tp,
struct xfs_buf *bp,
xfs_ino_t owner,
- __uint16_t type)
+ uint16_t type)
{
struct xfs_dir2_leaf *leaf = bp->b_addr;
@@ -343,7 +343,7 @@ xfs_dir3_leaf_get_buf(
xfs_da_args_t *args,
xfs_dir2_db_t bno,
struct xfs_buf **bpp,
- __uint16_t magic)
+ uint16_t magic)
{
struct xfs_inode *dp = args->dp;
struct xfs_trans *tp = args->trans;
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index bbd1238852b3..682e2bf370c7 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -528,7 +528,7 @@ xfs_dir2_free_hdr_check(
* Stale entries are ok.
*/
xfs_dahash_t /* hash value */
-xfs_dir2_leafn_lasthash(
+xfs_dir2_leaf_lasthash(
struct xfs_inode *dp,
struct xfs_buf *bp, /* leaf buffer */
int *count) /* count of entries in leaf */
@@ -540,7 +540,9 @@ xfs_dir2_leafn_lasthash(
dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
- leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
if (count)
*count = leafhdr.count;
@@ -1405,8 +1407,8 @@ xfs_dir2_leafn_split(
/*
* Update last hashval in each block since we added the name.
*/
- oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL);
- newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL);
+ oldblk->hashval = xfs_dir2_leaf_lasthash(dp, oldblk->bp, NULL);
+ newblk->hashval = xfs_dir2_leaf_lasthash(dp, newblk->bp, NULL);
xfs_dir3_leaf_check(dp, oldblk->bp);
xfs_dir3_leaf_check(dp, newblk->bp);
return error;
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 39f8604f764e..4badd26c47e6 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -58,6 +58,8 @@ extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
struct xfs_buf **bpp);
/* xfs_dir2_leaf.c */
+extern int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
@@ -69,7 +71,7 @@ extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
struct xfs_dir2_leaf_entry *ents, int *indexp,
int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
- struct xfs_buf **bpp, __uint16_t magic);
+ struct xfs_buf **bpp, uint16_t magic);
extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
struct xfs_buf *bp, int first, int last);
extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
@@ -93,7 +95,7 @@ extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
/* xfs_dir2_node.c */
extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
struct xfs_buf *lbp);
-extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp,
+extern xfs_dahash_t xfs_dir2_leaf_lasthash(struct xfs_inode *dp,
struct xfs_buf *bp, int *count);
extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
struct xfs_da_args *args, int *indexp,
@@ -128,7 +130,7 @@ extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
extern int xfs_dir2_sf_verify(struct xfs_inode *ip);
/* xfs_dir2_readdir.c */
-extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
- size_t bufsize);
+extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct dir_context *ctx, size_t bufsize);
#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index e84af093b2ab..be8b9755f66a 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -647,7 +647,7 @@ xfs_dir2_sf_verify(
int offset;
int size;
int error;
- __uint8_t filetype;
+ uint8_t filetype;
ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL);
/*
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index a1dccd8d96bc..23229f0c5b15 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -103,8 +103,8 @@ struct xfs_ifork;
* Must be padded to 64 bit alignment.
*/
typedef struct xfs_sb {
- __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
- __uint32_t sb_blocksize; /* logical block size, bytes */
+ uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
+ uint32_t sb_blocksize; /* logical block size, bytes */
xfs_rfsblock_t sb_dblocks; /* number of data blocks */
xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
xfs_rtblock_t sb_rextents; /* number of realtime extents */
@@ -118,45 +118,45 @@ typedef struct xfs_sb {
xfs_agnumber_t sb_agcount; /* number of allocation groups */
xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
xfs_extlen_t sb_logblocks; /* number of log blocks */
- __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
- __uint16_t sb_sectsize; /* volume sector size, bytes */
- __uint16_t sb_inodesize; /* inode size, bytes */
- __uint16_t sb_inopblock; /* inodes per block */
+ uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
+ uint16_t sb_sectsize; /* volume sector size, bytes */
+ uint16_t sb_inodesize; /* inode size, bytes */
+ uint16_t sb_inopblock; /* inodes per block */
char sb_fname[12]; /* file system name */
- __uint8_t sb_blocklog; /* log2 of sb_blocksize */
- __uint8_t sb_sectlog; /* log2 of sb_sectsize */
- __uint8_t sb_inodelog; /* log2 of sb_inodesize */
- __uint8_t sb_inopblog; /* log2 of sb_inopblock */
- __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
- __uint8_t sb_rextslog; /* log2 of sb_rextents */
- __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
- __uint8_t sb_imax_pct; /* max % of fs for inode space */
+ uint8_t sb_blocklog; /* log2 of sb_blocksize */
+ uint8_t sb_sectlog; /* log2 of sb_sectsize */
+ uint8_t sb_inodelog; /* log2 of sb_inodesize */
+ uint8_t sb_inopblog; /* log2 of sb_inopblock */
+ uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
+ uint8_t sb_rextslog; /* log2 of sb_rextents */
+ uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
+ uint8_t sb_imax_pct; /* max % of fs for inode space */
/* statistics */
/*
* These fields must remain contiguous. If you really
* want to change their layout, make sure you fix the
* code in xfs_trans_apply_sb_deltas().
*/
- __uint64_t sb_icount; /* allocated inodes */
- __uint64_t sb_ifree; /* free inodes */
- __uint64_t sb_fdblocks; /* free data blocks */
- __uint64_t sb_frextents; /* free realtime extents */
+ uint64_t sb_icount; /* allocated inodes */
+ uint64_t sb_ifree; /* free inodes */
+ uint64_t sb_fdblocks; /* free data blocks */
+ uint64_t sb_frextents; /* free realtime extents */
/*
* End contiguous fields.
*/
xfs_ino_t sb_uquotino; /* user quota inode */
xfs_ino_t sb_gquotino; /* group quota inode */
- __uint16_t sb_qflags; /* quota flags */
- __uint8_t sb_flags; /* misc. flags */
- __uint8_t sb_shared_vn; /* shared version number */
+ uint16_t sb_qflags; /* quota flags */
+ uint8_t sb_flags; /* misc. flags */
+ uint8_t sb_shared_vn; /* shared version number */
xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
- __uint32_t sb_unit; /* stripe or raid unit */
- __uint32_t sb_width; /* stripe or raid width */
- __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
- __uint8_t sb_logsectlog; /* log2 of the log sector size */
- __uint16_t sb_logsectsize; /* sector size for the log, bytes */
- __uint32_t sb_logsunit; /* stripe unit size for the log */
- __uint32_t sb_features2; /* additional feature bits */
+ uint32_t sb_unit; /* stripe or raid unit */
+ uint32_t sb_width; /* stripe or raid width */
+ uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
+ uint8_t sb_logsectlog; /* log2 of the log sector size */
+ uint16_t sb_logsectsize; /* sector size for the log, bytes */
+ uint32_t sb_logsunit; /* stripe unit size for the log */
+ uint32_t sb_features2; /* additional feature bits */
/*
* bad features2 field as a result of failing to pad the sb structure to
@@ -167,17 +167,17 @@ typedef struct xfs_sb {
* the value in sb_features2 when formatting the incore superblock to
* the disk buffer.
*/
- __uint32_t sb_bad_features2;
+ uint32_t sb_bad_features2;
/* version 5 superblock fields start here */
/* feature masks */
- __uint32_t sb_features_compat;
- __uint32_t sb_features_ro_compat;
- __uint32_t sb_features_incompat;
- __uint32_t sb_features_log_incompat;
+ uint32_t sb_features_compat;
+ uint32_t sb_features_ro_compat;
+ uint32_t sb_features_incompat;
+ uint32_t sb_features_log_incompat;
- __uint32_t sb_crc; /* superblock crc */
+ uint32_t sb_crc; /* superblock crc */
xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */
xfs_ino_t sb_pquotino; /* project quota inode */
@@ -449,7 +449,7 @@ static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
static inline bool
xfs_sb_has_compat_feature(
struct xfs_sb *sbp,
- __uint32_t feature)
+ uint32_t feature)
{
return (sbp->sb_features_compat & feature) != 0;
}
@@ -465,7 +465,7 @@ xfs_sb_has_compat_feature(
static inline bool
xfs_sb_has_ro_compat_feature(
struct xfs_sb *sbp,
- __uint32_t feature)
+ uint32_t feature)
{
return (sbp->sb_features_ro_compat & feature) != 0;
}
@@ -482,7 +482,7 @@ xfs_sb_has_ro_compat_feature(
static inline bool
xfs_sb_has_incompat_feature(
struct xfs_sb *sbp,
- __uint32_t feature)
+ uint32_t feature)
{
return (sbp->sb_features_incompat & feature) != 0;
}
@@ -492,7 +492,7 @@ xfs_sb_has_incompat_feature(
static inline bool
xfs_sb_has_incompat_log_feature(
struct xfs_sb *sbp,
- __uint32_t feature)
+ uint32_t feature)
{
return (sbp->sb_features_log_incompat & feature) != 0;
}
@@ -594,8 +594,8 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
*/
#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
#define XFS_B_TO_FSB(mp,b) \
- ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
-#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
+ ((((uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSBT(mp,b) (((uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
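XFS_B_TO_FSB rounds a byte count up to whole filesystem blocks while XFS_B_TO_FSBT truncates; both reduce to shifts because the block size is a power of two. A sketch with a fixed 4k block (the real block size and mask come from the superblock):

#include <stdint.h>

#define BLOCKLOG	12u			/* log2(4096) */
#define BLOCKMASK	((1u << BLOCKLOG) - 1)

static uint64_t b_to_fsb(uint64_t bytes)	/* round up */
{
	return (bytes + BLOCKMASK) >> BLOCKLOG;
}

static uint64_t b_to_fsbt(uint64_t bytes)	/* truncate */
{
	return bytes >> BLOCKLOG;
}

/* e.g. b_to_fsb(4097) == 2, but b_to_fsbt(4097) == 1 */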
/*
@@ -1072,7 +1072,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
* next agno_log bits - ag number
* high agno_log-agblklog-inopblog bits - 0
*/
-#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1)
+#define XFS_INO_MASK(k) (uint32_t)((1ULL << (k)) - 1)
#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog
#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog
#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log
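Per the layout comment above, an inode number packs the in-block offset, the AG block number, and the AG number into adjacent bitfields whose widths come from the superblock. A sketch of the decomposition with illustrative fixed widths (the kernel reads them from sb_inopblog and sb_agblklog):

#include <stdint.h>

#define INOPBLOG	5u	/* log2(inodes per block), illustrative */
#define AGBLKLOG	16u	/* log2(blocks per AG), illustrative */

#define INO_MASK(k)	(uint32_t)((1ULL << (k)) - 1)

static uint32_t ino_offset(uint64_t ino)	/* inode within block */
{
	return ino & INO_MASK(INOPBLOG);
}

static uint32_t ino_agbno(uint64_t ino)		/* block within AG */
{
	return (ino >> INOPBLOG) & INO_MASK(AGBLKLOG);
}

static uint32_t ino_agno(uint64_t ino)		/* AG number */
{
	return ino >> (INOPBLOG + AGBLKLOG);
}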
@@ -1211,6 +1211,7 @@ struct xfs_dsymlink_hdr {
#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc)
+#define XFS_SYMLINK_MAXLEN 1024
/*
* The maximum pathlen is 1024 bytes. Since the minimum file system
* blocksize is 512 bytes, we can get a max of 3 extents back from
@@ -1269,16 +1270,16 @@ typedef __be32 xfs_alloc_ptr_t;
#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */
#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */
-typedef __uint64_t xfs_inofree_t;
+typedef uint64_t xfs_inofree_t;
#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */
-#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t))
+#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(uint16_t))
#define XFS_INODES_PER_HOLEMASK_BIT \
- (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t)))
+ (XFS_INODES_PER_CHUNK / (NBBY * sizeof(uint16_t)))
static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
{
@@ -1312,9 +1313,9 @@ typedef struct xfs_inobt_rec {
typedef struct xfs_inobt_rec_incore {
xfs_agino_t ir_startino; /* starting inode number */
- __uint16_t ir_holemask; /* hole mask for sparse chunks */
- __uint8_t ir_count; /* total inode count */
- __uint8_t ir_freecount; /* count of free inodes (set bits) */
+ uint16_t ir_holemask; /* hole mask for sparse chunks */
+ uint8_t ir_count; /* total inode count */
+ uint8_t ir_freecount; /* count of free inodes (set bits) */
xfs_inofree_t ir_free; /* free inode mask */
} xfs_inobt_rec_incore_t;
@@ -1397,15 +1398,15 @@ struct xfs_rmap_rec {
* rm_offset:54-60 aren't used and should be zero
* rm_offset:0-53 is the block offset within the inode
*/
-#define XFS_RMAP_OFF_ATTR_FORK ((__uint64_t)1ULL << 63)
-#define XFS_RMAP_OFF_BMBT_BLOCK ((__uint64_t)1ULL << 62)
-#define XFS_RMAP_OFF_UNWRITTEN ((__uint64_t)1ULL << 61)
+#define XFS_RMAP_OFF_ATTR_FORK ((uint64_t)1ULL << 63)
+#define XFS_RMAP_OFF_BMBT_BLOCK ((uint64_t)1ULL << 62)
+#define XFS_RMAP_OFF_UNWRITTEN ((uint64_t)1ULL << 61)
-#define XFS_RMAP_LEN_MAX ((__uint32_t)~0U)
+#define XFS_RMAP_LEN_MAX ((uint32_t)~0U)
#define XFS_RMAP_OFF_FLAGS (XFS_RMAP_OFF_ATTR_FORK | \
XFS_RMAP_OFF_BMBT_BLOCK | \
XFS_RMAP_OFF_UNWRITTEN)
-#define XFS_RMAP_OFF_MASK ((__uint64_t)0x3FFFFFFFFFFFFFULL)
+#define XFS_RMAP_OFF_MASK ((uint64_t)0x3FFFFFFFFFFFFFULL)
#define XFS_RMAP_OFF(off) ((off) & XFS_RMAP_OFF_MASK)
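As the comment above describes, rm_offset packs a 54-bit file offset with three flag bits in bits 61-63 (bits 54-60 stay zero). A sketch of encoding and decoding under that layout:

#include <stdbool.h>
#include <stdint.h>

#define OFF_ATTR_FORK	((uint64_t)1 << 63)
#define OFF_BMBT_BLOCK	((uint64_t)1 << 62)
#define OFF_UNWRITTEN	((uint64_t)1 << 61)
#define OFF_MASK	((uint64_t)0x3FFFFFFFFFFFFFULL)	/* bits 0-53 */

static uint64_t rmap_encode(uint64_t offset, bool attr, bool unwritten)
{
	uint64_t v = offset & OFF_MASK;

	if (attr)
		v |= OFF_ATTR_FORK;
	if (unwritten)
		v |= OFF_UNWRITTEN;
	return v;
}

static uint64_t rmap_offset(uint64_t v)		/* strip the flag bits */
{
	return v & OFF_MASK;
}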
@@ -1431,8 +1432,8 @@ struct xfs_rmap_rec {
struct xfs_rmap_irec {
xfs_agblock_t rm_startblock; /* extent start block */
xfs_extlen_t rm_blockcount; /* extent length */
- __uint64_t rm_owner; /* extent owner */
- __uint64_t rm_offset; /* offset within the owner */
+ uint64_t rm_owner; /* extent owner */
+ uint64_t rm_offset; /* offset within the owner */
unsigned int rm_flags; /* state flags */
};
@@ -1544,11 +1545,11 @@ typedef struct xfs_bmbt_rec {
__be64 l0, l1;
} xfs_bmbt_rec_t;
-typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
+typedef uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
typedef struct xfs_bmbt_rec_host {
- __uint64_t l0, l1;
+ uint64_t l0, l1;
} xfs_bmbt_rec_host_t;
/*
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 095bdf049a3f..8c61f21535d4 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -302,10 +302,10 @@ typedef struct xfs_bstat {
* and using two 16bit values to hold new 32bit projid was chosen
* to retain compatibility with "old" filesystems).
*/
-static inline __uint32_t
+static inline uint32_t
bstat_get_projid(struct xfs_bstat *bs)
{
- return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
+ return (uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
}
/*
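bstat_get_projid() reassembles the 32-bit project id from the two 16-bit on-disk halves; the inverse split looks like this (sketch, struct is illustrative rather than the kernel's xfs_bstat):

#include <stdint.h>

struct bstat_ids {
	uint16_t projid_lo;	/* lower 16 bits of project id */
	uint16_t projid_hi;	/* upper 16 bits of project id */
};

static uint32_t get_projid(const struct bstat_ids *bs)
{
	return (uint32_t)bs->projid_hi << 16 | bs->projid_lo;
}

static void set_projid(struct bstat_ids *bs, uint32_t projid)
{
	bs->projid_lo = projid & 0xffff;
	bs->projid_hi = projid >> 16;
}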
@@ -446,19 +446,15 @@ typedef struct xfs_handle {
} xfs_handle_t;
#define ha_fsid ha_u._ha_fsid
-#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.fid_pad \
- - (char *) &(handle)) \
- + (handle).ha_fid.fid_len)
-
/*
* Structure passed to XFS_IOC_SWAPEXT
*/
typedef struct xfs_swapext
{
- __int64_t sx_version; /* version */
+ int64_t sx_version; /* version */
#define XFS_SX_VERSION 0
- __int64_t sx_fdtarget; /* fd of target file */
- __int64_t sx_fdtmp; /* fd of tmp file */
+ int64_t sx_fdtarget; /* fd of target file */
+ int64_t sx_fdtmp; /* fd of tmp file */
xfs_off_t sx_offset; /* offset into file */
xfs_off_t sx_length; /* length from offset */
char sx_pad[16]; /* pad space, unused */
@@ -546,7 +542,7 @@ typedef struct xfs_swapext
#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom)
-#define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t)
+#define XFS_IOC_GOINGDOWN _IOR ('X', 125, uint32_t)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index d41ade5d293e..ffd5a15d1bb6 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -46,7 +46,7 @@
/*
* Allocation group level functions.
*/
-static inline int
+int
xfs_ialloc_cluster_alignment(
struct xfs_mount *mp)
{
@@ -98,24 +98,15 @@ xfs_inobt_update(
return xfs_btree_update(cur, &rec);
}
-/*
- * Get the data from the pointed-to record.
- */
-int /* error */
-xfs_inobt_get_rec(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_inobt_rec_incore_t *irec, /* btree record */
- int *stat) /* output: success/failure */
+/* Convert on-disk btree record to incore inobt record. */
+void
+xfs_inobt_btrec_to_irec(
+ struct xfs_mount *mp,
+ union xfs_btree_rec *rec,
+ struct xfs_inobt_rec_incore *irec)
{
- union xfs_btree_rec *rec;
- int error;
-
- error = xfs_btree_get_rec(cur, &rec, stat);
- if (error || *stat == 0)
- return error;
-
irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
- if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ if (xfs_sb_version_hassparseinodes(&mp->m_sb)) {
irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
irec->ir_count = rec->inobt.ir_u.sp.ir_count;
irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
@@ -130,6 +121,25 @@ xfs_inobt_get_rec(
be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
}
irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_inobt_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_inobt_rec_incore *irec,
+ int *stat)
+{
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || *stat == 0)
+ return error;
+
+ xfs_inobt_btrec_to_irec(cur->bc_mp, rec, irec);
return 0;
}
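The refactor above separates fetching the record from decoding it, so a caller that already holds a raw union xfs_btree_rec (for example, a query-range callback) can decode it without a cursor-positioned lookup. A schematic of the pattern with illustrative types, endian conversion elided:

#include <stdint.h>

/* Illustrative stand-ins for the on-disk and incore record types. */
struct raw_rec { uint32_t startino; uint64_t free; };
struct irec    { uint32_t startino; uint64_t free; };

/* Decode only -- no cursor, no I/O; reusable from any record source. */
static void btrec_to_irec(const struct raw_rec *rec, struct irec *irec)
{
	irec->startino = rec->startino;	/* be32_to_cpu() in the kernel */
	irec->free = rec->free;		/* be64_to_cpu() in the kernel */
}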
@@ -140,9 +150,9 @@ xfs_inobt_get_rec(
STATIC int
xfs_inobt_insert_rec(
struct xfs_btree_cur *cur,
- __uint16_t holemask,
- __uint8_t count,
- __int32_t freecount,
+ uint16_t holemask,
+ uint8_t count,
+ int32_t freecount,
xfs_inofree_t free,
int *stat)
{
@@ -2542,8 +2552,7 @@ xfs_agi_read_verify(
!xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
xfs_buf_ioerror(bp, -EFSBADCRC);
else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
- XFS_ERRTAG_IALLOC_READ_AGI,
- XFS_RANDOM_IALLOC_READ_AGI))
+ XFS_ERRTAG_IALLOC_READ_AGI))
xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 0bb89669fc07..b32cfb5aeb5b 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -168,5 +168,10 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, struct xfs_buf **bpp);
+union xfs_btree_rec;
+void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec,
+ struct xfs_inobt_rec_incore *irec);
+
+int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 7c471881c9a6..317caba9faa6 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -175,6 +175,18 @@ xfs_inobt_init_key_from_rec(
}
STATIC void
+xfs_inobt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->inobt.ir_startino);
+ x += XFS_INODES_PER_CHUNK - 1;
+ key->inobt.ir_startino = cpu_to_be32(x);
+}
+
+STATIC void
xfs_inobt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
@@ -219,15 +231,25 @@ xfs_finobt_init_ptr_from_cur(
ptr->s = agi->agi_free_root;
}
-STATIC __int64_t
+STATIC int64_t
xfs_inobt_key_diff(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
{
- return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
+ return (int64_t)be32_to_cpu(key->inobt.ir_startino) -
cur->bc_rec.i.ir_startino;
}
+STATIC int64_t
+xfs_inobt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return (int64_t)be32_to_cpu(k1->inobt.ir_startino) -
+ be32_to_cpu(k2->inobt.ir_startino);
+}
+
static int
xfs_inobt_verify(
struct xfs_buf *bp)
@@ -302,7 +324,6 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = {
.verify_write = xfs_inobt_write_verify,
};
-#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
xfs_inobt_keys_inorder(
struct xfs_btree_cur *cur,
@@ -322,7 +343,6 @@ xfs_inobt_recs_inorder(
return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
be32_to_cpu(r2->inobt.ir_startino);
}
-#endif /* DEBUG */
static const struct xfs_btree_ops xfs_inobt_ops = {
.rec_len = sizeof(xfs_inobt_rec_t),
@@ -335,14 +355,14 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
.get_minrecs = xfs_inobt_get_minrecs,
.get_maxrecs = xfs_inobt_get_maxrecs,
.init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
.buf_ops = &xfs_inobt_buf_ops,
-#if defined(DEBUG) || defined(XFS_WARN)
+ .diff_two_keys = xfs_inobt_diff_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
-#endif
};
static const struct xfs_btree_ops xfs_finobt_ops = {
@@ -356,14 +376,14 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
.get_minrecs = xfs_inobt_get_minrecs,
.get_maxrecs = xfs_inobt_get_maxrecs,
.init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
.buf_ops = &xfs_inobt_buf_ops,
-#if defined(DEBUG) || defined(XFS_WARN)
+ .diff_two_keys = xfs_inobt_diff_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
-#endif
};
/*
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 09c3d1aecef2..378f8fbc91a7 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -105,8 +105,7 @@ xfs_inode_buf_verify(
di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
xfs_dinode_good_version(mp, dip->di_version);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
- XFS_ERRTAG_ITOBP_INOTOBP,
- XFS_RANDOM_ITOBP_INOTOBP))) {
+ XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
bp->b_flags &= ~XBF_DONE;
xfs_buf_ioerror(bp, -EIO);
@@ -381,7 +380,7 @@ xfs_log_dinode_to_disk(
}
}
-static bool
+bool
xfs_dinode_verify(
struct xfs_mount *mp,
xfs_ino_t ino,
@@ -444,7 +443,7 @@ xfs_dinode_calc_crc(
struct xfs_mount *mp,
struct xfs_dinode *dip)
{
- __uint32_t crc;
+ uint32_t crc;
if (dip->di_version < 3)
return;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 6848a0afbce7..a9c97a356c30 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -28,26 +28,26 @@ struct xfs_dinode;
* format specific structures at the appropriate time.
*/
struct xfs_icdinode {
- __int8_t di_version; /* inode version */
- __int8_t di_format; /* format of di_c data */
- __uint16_t di_flushiter; /* incremented on flush */
- __uint32_t di_uid; /* owner's user id */
- __uint32_t di_gid; /* owner's group id */
- __uint16_t di_projid_lo; /* lower part of owner's project id */
- __uint16_t di_projid_hi; /* higher part of owner's project id */
+ int8_t di_version; /* inode version */
+ int8_t di_format; /* format of di_c data */
+ uint16_t di_flushiter; /* incremented on flush */
+ uint32_t di_uid; /* owner's user id */
+ uint32_t di_gid; /* owner's group id */
+ uint16_t di_projid_lo; /* lower part of owner's project id */
+ uint16_t di_projid_hi; /* higher part of owner's project id */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
xfs_extnum_t di_nextents; /* number of extents in data fork */
xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
- __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
- __int8_t di_aformat; /* format of attr fork's data */
- __uint32_t di_dmevmask; /* DMIG event mask */
- __uint16_t di_dmstate; /* DMIG state info */
- __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
+ uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
+ int8_t di_aformat; /* format of attr fork's data */
+ uint32_t di_dmevmask; /* DMIG event mask */
+ uint16_t di_dmstate; /* DMIG state info */
+ uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
- __uint64_t di_flags2; /* more random flags */
- __uint32_t di_cowextsize; /* basic cow extent size for file */
+ uint64_t di_flags2; /* more random flags */
+ uint32_t di_cowextsize; /* basic cow extent size for file */
xfs_ictimestamp_t di_crtime; /* time created */
};
@@ -82,4 +82,7 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
#define xfs_inobp_check(mp, bp)
#endif /* DEBUG */
+bool xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
+ struct xfs_dinode *dip);
+
#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 7ae571f8e34a..8372e9bcd7b6 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -31,7 +31,7 @@ struct xfs_trans_res;
* through all the log items definitions and everything they encode into the
* log.
*/
-typedef __uint32_t xlog_tid_t;
+typedef uint32_t xlog_tid_t;
#define XLOG_MIN_ICLOGS 2
#define XLOG_MAX_ICLOGS 8
@@ -211,7 +211,7 @@ typedef struct xfs_log_iovec {
typedef struct xfs_trans_header {
uint th_magic; /* magic number */
uint th_type; /* transaction type */
- __int32_t th_tid; /* transaction id (unused) */
+ int32_t th_tid; /* transaction id (unused) */
uint th_num_items; /* num items logged by trans */
} xfs_trans_header_t;
@@ -265,52 +265,52 @@ typedef struct xfs_trans_header {
* must be added on to the end.
*/
typedef struct xfs_inode_log_format {
- __uint16_t ilf_type; /* inode log item type */
- __uint16_t ilf_size; /* size of this item */
- __uint32_t ilf_fields; /* flags for fields logged */
- __uint16_t ilf_asize; /* size of attr d/ext/root */
- __uint16_t ilf_dsize; /* size of data/ext/root */
- __uint64_t ilf_ino; /* inode number */
+ uint16_t ilf_type; /* inode log item type */
+ uint16_t ilf_size; /* size of this item */
+ uint32_t ilf_fields; /* flags for fields logged */
+ uint16_t ilf_asize; /* size of attr d/ext/root */
+ uint16_t ilf_dsize; /* size of data/ext/root */
+ uint64_t ilf_ino; /* inode number */
union {
- __uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uint32_t ilfu_rdev; /* rdev value for dev inode*/
uuid_t ilfu_uuid; /* mount point value */
} ilf_u;
- __int64_t ilf_blkno; /* blkno of inode buffer */
- __int32_t ilf_len; /* len of inode buffer */
- __int32_t ilf_boffset; /* off of inode in buffer */
+ int64_t ilf_blkno; /* blkno of inode buffer */
+ int32_t ilf_len; /* len of inode buffer */
+ int32_t ilf_boffset; /* off of inode in buffer */
} xfs_inode_log_format_t;
typedef struct xfs_inode_log_format_32 {
- __uint16_t ilf_type; /* inode log item type */
- __uint16_t ilf_size; /* size of this item */
- __uint32_t ilf_fields; /* flags for fields logged */
- __uint16_t ilf_asize; /* size of attr d/ext/root */
- __uint16_t ilf_dsize; /* size of data/ext/root */
- __uint64_t ilf_ino; /* inode number */
+ uint16_t ilf_type; /* inode log item type */
+ uint16_t ilf_size; /* size of this item */
+ uint32_t ilf_fields; /* flags for fields logged */
+ uint16_t ilf_asize; /* size of attr d/ext/root */
+ uint16_t ilf_dsize; /* size of data/ext/root */
+ uint64_t ilf_ino; /* inode number */
union {
- __uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uint32_t ilfu_rdev; /* rdev value for dev inode*/
uuid_t ilfu_uuid; /* mount point value */
} ilf_u;
- __int64_t ilf_blkno; /* blkno of inode buffer */
- __int32_t ilf_len; /* len of inode buffer */
- __int32_t ilf_boffset; /* off of inode in buffer */
+ int64_t ilf_blkno; /* blkno of inode buffer */
+ int32_t ilf_len; /* len of inode buffer */
+ int32_t ilf_boffset; /* off of inode in buffer */
} __attribute__((packed)) xfs_inode_log_format_32_t;
typedef struct xfs_inode_log_format_64 {
- __uint16_t ilf_type; /* inode log item type */
- __uint16_t ilf_size; /* size of this item */
- __uint32_t ilf_fields; /* flags for fields logged */
- __uint16_t ilf_asize; /* size of attr d/ext/root */
- __uint16_t ilf_dsize; /* size of data/ext/root */
- __uint32_t ilf_pad; /* pad for 64 bit boundary */
- __uint64_t ilf_ino; /* inode number */
+ uint16_t ilf_type; /* inode log item type */
+ uint16_t ilf_size; /* size of this item */
+ uint32_t ilf_fields; /* flags for fields logged */
+ uint16_t ilf_asize; /* size of attr d/ext/root */
+ uint16_t ilf_dsize; /* size of data/ext/root */
+ uint32_t ilf_pad; /* pad for 64 bit boundary */
+ uint64_t ilf_ino; /* inode number */
union {
- __uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uint32_t ilfu_rdev; /* rdev value for dev inode*/
uuid_t ilfu_uuid; /* mount point value */
} ilf_u;
- __int64_t ilf_blkno; /* blkno of inode buffer */
- __int32_t ilf_len; /* len of inode buffer */
- __int32_t ilf_boffset; /* off of inode in buffer */
+ int64_t ilf_blkno; /* blkno of inode buffer */
+ int32_t ilf_len; /* len of inode buffer */
+ int32_t ilf_boffset; /* off of inode in buffer */
} xfs_inode_log_format_64_t;
@@ -379,8 +379,8 @@ static inline int xfs_ilog_fdata(int w)
* information.
*/
typedef struct xfs_ictimestamp {
- __int32_t t_sec; /* timestamp seconds */
- __int32_t t_nsec; /* timestamp nanoseconds */
+ int32_t t_sec; /* timestamp seconds */
+ int32_t t_nsec; /* timestamp nanoseconds */
} xfs_ictimestamp_t;
/*
@@ -388,18 +388,18 @@ typedef struct xfs_ictimestamp {
* kept identical to struct xfs_dinode except for the endianness annotations.
*/
struct xfs_log_dinode {
- __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
- __uint16_t di_mode; /* mode and type of file */
- __int8_t di_version; /* inode version */
- __int8_t di_format; /* format of di_c data */
- __uint8_t di_pad3[2]; /* unused in v2/3 inodes */
- __uint32_t di_uid; /* owner's user id */
- __uint32_t di_gid; /* owner's group id */
- __uint32_t di_nlink; /* number of links to file */
- __uint16_t di_projid_lo; /* lower part of owner's project id */
- __uint16_t di_projid_hi; /* higher part of owner's project id */
- __uint8_t di_pad[6]; /* unused, zeroed space */
- __uint16_t di_flushiter; /* incremented on flush */
+ uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
+ uint16_t di_mode; /* mode and type of file */
+ int8_t di_version; /* inode version */
+ int8_t di_format; /* format of di_c data */
+ uint8_t di_pad3[2]; /* unused in v2/3 inodes */
+ uint32_t di_uid; /* owner's user id */
+ uint32_t di_gid; /* owner's group id */
+ uint32_t di_nlink; /* number of links to file */
+ uint16_t di_projid_lo; /* lower part of owner's project id */
+ uint16_t di_projid_hi; /* higher part of owner's project id */
+ uint8_t di_pad[6]; /* unused, zeroed space */
+ uint16_t di_flushiter; /* incremented on flush */
xfs_ictimestamp_t di_atime; /* time last accessed */
xfs_ictimestamp_t di_mtime; /* time last modified */
xfs_ictimestamp_t di_ctime; /* time created/inode modified */
@@ -408,23 +408,23 @@ struct xfs_log_dinode {
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
xfs_extnum_t di_nextents; /* number of extents in data fork */
xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
- __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
- __int8_t di_aformat; /* format of attr fork's data */
- __uint32_t di_dmevmask; /* DMIG event mask */
- __uint16_t di_dmstate; /* DMIG state info */
- __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
- __uint32_t di_gen; /* generation number */
+ uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
+ int8_t di_aformat; /* format of attr fork's data */
+ uint32_t di_dmevmask; /* DMIG event mask */
+ uint16_t di_dmstate; /* DMIG state info */
+ uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
+ uint32_t di_gen; /* generation number */
/* di_next_unlinked is the only non-core field in the old dinode */
xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
/* start of the extended dinode, writable fields */
- __uint32_t di_crc; /* CRC of the inode */
- __uint64_t di_changecount; /* number of attribute changes */
+ uint32_t di_crc; /* CRC of the inode */
+ uint64_t di_changecount; /* number of attribute changes */
xfs_lsn_t di_lsn; /* flush sequence */
- __uint64_t di_flags2; /* more random flags */
- __uint32_t di_cowextsize; /* basic cow extent size for file */
- __uint8_t di_pad2[12]; /* more padding for future expansion */
+ uint64_t di_flags2; /* more random flags */
+ uint32_t di_cowextsize; /* basic cow extent size for file */
+ uint8_t di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
xfs_ictimestamp_t di_crtime; /* time created */
@@ -483,7 +483,7 @@ typedef struct xfs_buf_log_format {
unsigned short blf_size; /* size of this item */
unsigned short blf_flags; /* misc state */
unsigned short blf_len; /* number of blocks in this buf */
- __int64_t blf_blkno; /* starting blkno of this buf */
+ int64_t blf_blkno; /* starting blkno of this buf */
unsigned int blf_map_size; /* used size of data bitmap in words */
unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
} xfs_buf_log_format_t;
@@ -533,7 +533,7 @@ xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
}
-static inline __uint16_t
+static inline uint16_t
xfs_blft_from_flags(struct xfs_buf_log_format *blf)
{
return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
@@ -554,14 +554,14 @@ typedef struct xfs_extent {
* conversion routine.
*/
typedef struct xfs_extent_32 {
- __uint64_t ext_start;
- __uint32_t ext_len;
+ uint64_t ext_start;
+ uint32_t ext_len;
} __attribute__((packed)) xfs_extent_32_t;
typedef struct xfs_extent_64 {
- __uint64_t ext_start;
- __uint32_t ext_len;
- __uint32_t ext_pad;
+ uint64_t ext_start;
+ uint32_t ext_len;
+ uint32_t ext_pad;
} xfs_extent_64_t;
/*
@@ -570,26 +570,26 @@ typedef struct xfs_extent_64 {
* size is given by efi_nextents.
*/
typedef struct xfs_efi_log_format {
- __uint16_t efi_type; /* efi log item type */
- __uint16_t efi_size; /* size of this item */
- __uint32_t efi_nextents; /* # extents to free */
- __uint64_t efi_id; /* efi identifier */
+ uint16_t efi_type; /* efi log item type */
+ uint16_t efi_size; /* size of this item */
+ uint32_t efi_nextents; /* # extents to free */
+ uint64_t efi_id; /* efi identifier */
xfs_extent_t efi_extents[1]; /* array of extents to free */
} xfs_efi_log_format_t;
typedef struct xfs_efi_log_format_32 {
- __uint16_t efi_type; /* efi log item type */
- __uint16_t efi_size; /* size of this item */
- __uint32_t efi_nextents; /* # extents to free */
- __uint64_t efi_id; /* efi identifier */
+ uint16_t efi_type; /* efi log item type */
+ uint16_t efi_size; /* size of this item */
+ uint32_t efi_nextents; /* # extents to free */
+ uint64_t efi_id; /* efi identifier */
xfs_extent_32_t efi_extents[1]; /* array of extents to free */
} __attribute__((packed)) xfs_efi_log_format_32_t;
typedef struct xfs_efi_log_format_64 {
- __uint16_t efi_type; /* efi log item type */
- __uint16_t efi_size; /* size of this item */
- __uint32_t efi_nextents; /* # extents to free */
- __uint64_t efi_id; /* efi identifier */
+ uint16_t efi_type; /* efi log item type */
+ uint16_t efi_size; /* size of this item */
+ uint32_t efi_nextents; /* # extents to free */
+ uint64_t efi_id; /* efi identifier */
xfs_extent_64_t efi_extents[1]; /* array of extents to free */
} xfs_efi_log_format_64_t;
@@ -599,26 +599,26 @@ typedef struct xfs_efi_log_format_64 {
* size is given by efd_nextents;
*/
typedef struct xfs_efd_log_format {
- __uint16_t efd_type; /* efd log item type */
- __uint16_t efd_size; /* size of this item */
- __uint32_t efd_nextents; /* # of extents freed */
- __uint64_t efd_efi_id; /* id of corresponding efi */
+ uint16_t efd_type; /* efd log item type */
+ uint16_t efd_size; /* size of this item */
+ uint32_t efd_nextents; /* # of extents freed */
+ uint64_t efd_efi_id; /* id of corresponding efi */
xfs_extent_t efd_extents[1]; /* array of extents freed */
} xfs_efd_log_format_t;
typedef struct xfs_efd_log_format_32 {
- __uint16_t efd_type; /* efd log item type */
- __uint16_t efd_size; /* size of this item */
- __uint32_t efd_nextents; /* # of extents freed */
- __uint64_t efd_efi_id; /* id of corresponding efi */
+ uint16_t efd_type; /* efd log item type */
+ uint16_t efd_size; /* size of this item */
+ uint32_t efd_nextents; /* # of extents freed */
+ uint64_t efd_efi_id; /* id of corresponding efi */
xfs_extent_32_t efd_extents[1]; /* array of extents freed */
} __attribute__((packed)) xfs_efd_log_format_32_t;
typedef struct xfs_efd_log_format_64 {
- __uint16_t efd_type; /* efd log item type */
- __uint16_t efd_size; /* size of this item */
- __uint32_t efd_nextents; /* # of extents freed */
- __uint64_t efd_efi_id; /* id of corresponding efi */
+ uint16_t efd_type; /* efd log item type */
+ uint16_t efd_size; /* size of this item */
+ uint32_t efd_nextents; /* # of extents freed */
+ uint64_t efd_efi_id; /* id of corresponding efi */
xfs_extent_64_t efd_extents[1]; /* array of extents freed */
} xfs_efd_log_format_64_t;
@@ -626,11 +626,11 @@ typedef struct xfs_efd_log_format_64 {
* RUI/RUD (reverse mapping) log format definitions
*/
struct xfs_map_extent {
- __uint64_t me_owner;
- __uint64_t me_startblock;
- __uint64_t me_startoff;
- __uint32_t me_len;
- __uint32_t me_flags;
+ uint64_t me_owner;
+ uint64_t me_startblock;
+ uint64_t me_startoff;
+ uint32_t me_len;
+ uint32_t me_flags;
};
/* rmap me_flags: upper bits are flags, lower byte is type code */
@@ -659,10 +659,10 @@ struct xfs_map_extent {
* size is given by rui_nextents.
*/
struct xfs_rui_log_format {
- __uint16_t rui_type; /* rui log item type */
- __uint16_t rui_size; /* size of this item */
- __uint32_t rui_nextents; /* # extents to free */
- __uint64_t rui_id; /* rui identifier */
+ uint16_t rui_type; /* rui log item type */
+ uint16_t rui_size; /* size of this item */
+ uint32_t rui_nextents; /* # extents to free */
+ uint64_t rui_id; /* rui identifier */
struct xfs_map_extent rui_extents[]; /* array of extents to rmap */
};
@@ -680,19 +680,19 @@ xfs_rui_log_format_sizeof(
* size is given by rud_nextents;
*/
struct xfs_rud_log_format {
- __uint16_t rud_type; /* rud log item type */
- __uint16_t rud_size; /* size of this item */
- __uint32_t __pad;
- __uint64_t rud_rui_id; /* id of corresponding rui */
+ uint16_t rud_type; /* rud log item type */
+ uint16_t rud_size; /* size of this item */
+ uint32_t __pad;
+ uint64_t rud_rui_id; /* id of corresponding rui */
};
/*
* CUI/CUD (refcount update) log format definitions
*/
struct xfs_phys_extent {
- __uint64_t pe_startblock;
- __uint32_t pe_len;
- __uint32_t pe_flags;
+ uint64_t pe_startblock;
+ uint32_t pe_len;
+ uint32_t pe_flags;
};
/* refcount pe_flags: upper bits are flags, lower byte is type code */
@@ -707,10 +707,10 @@ struct xfs_phys_extent {
* size is given by cui_nextents.
*/
struct xfs_cui_log_format {
- __uint16_t cui_type; /* cui log item type */
- __uint16_t cui_size; /* size of this item */
- __uint32_t cui_nextents; /* # extents to free */
- __uint64_t cui_id; /* cui identifier */
+ uint16_t cui_type; /* cui log item type */
+ uint16_t cui_size; /* size of this item */
+ uint32_t cui_nextents; /* # extents to free */
+ uint64_t cui_id; /* cui identifier */
struct xfs_phys_extent cui_extents[]; /* array of extents */
};
@@ -728,10 +728,10 @@ xfs_cui_log_format_sizeof(
* size is given by cud_nextents;
*/
struct xfs_cud_log_format {
- __uint16_t cud_type; /* cud log item type */
- __uint16_t cud_size; /* size of this item */
- __uint32_t __pad;
- __uint64_t cud_cui_id; /* id of corresponding cui */
+ uint16_t cud_type; /* cud log item type */
+ uint16_t cud_size; /* size of this item */
+ uint32_t __pad;
+ uint64_t cud_cui_id; /* id of corresponding cui */
};
/*
@@ -755,10 +755,10 @@ struct xfs_cud_log_format {
* size is given by bui_nextents.
*/
struct xfs_bui_log_format {
- __uint16_t bui_type; /* bui log item type */
- __uint16_t bui_size; /* size of this item */
- __uint32_t bui_nextents; /* # extents to free */
- __uint64_t bui_id; /* bui identifier */
+ uint16_t bui_type; /* bui log item type */
+ uint16_t bui_size; /* size of this item */
+ uint32_t bui_nextents; /* # extents to free */
+ uint64_t bui_id; /* bui identifier */
struct xfs_map_extent bui_extents[]; /* array of extents to bmap */
};
@@ -776,10 +776,10 @@ xfs_bui_log_format_sizeof(
* size is given by bud_nextents;
*/
struct xfs_bud_log_format {
- __uint16_t bud_type; /* bud log item type */
- __uint16_t bud_size; /* size of this item */
- __uint32_t __pad;
- __uint64_t bud_bui_id; /* id of corresponding bui */
+ uint16_t bud_type; /* bud log item type */
+ uint16_t bud_size; /* size of this item */
+ uint32_t __pad;
+ uint64_t bud_bui_id; /* id of corresponding bui */
};
/*
@@ -789,12 +789,12 @@ struct xfs_bud_log_format {
* 32 bits : log_recovery code assumes that.
*/
typedef struct xfs_dq_logformat {
- __uint16_t qlf_type; /* dquot log item type */
- __uint16_t qlf_size; /* size of this item */
+ uint16_t qlf_type; /* dquot log item type */
+ uint16_t qlf_size; /* size of this item */
xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
- __int64_t qlf_blkno; /* blkno of dquot buffer */
- __int32_t qlf_len; /* len of dquot buffer */
- __uint32_t qlf_boffset; /* off of dquot in buffer */
+ int64_t qlf_blkno; /* blkno of dquot buffer */
+ int32_t qlf_len; /* len of dquot buffer */
+ uint32_t qlf_boffset; /* off of dquot in buffer */
} xfs_dq_logformat_t;
/*
@@ -853,8 +853,8 @@ typedef struct xfs_qoff_logformat {
* decoding can be done correctly.
*/
struct xfs_icreate_log {
- __uint16_t icl_type; /* type of log format structure */
- __uint16_t icl_size; /* size of log format structure */
+ uint16_t icl_type; /* type of log format structure */
+ uint16_t icl_size; /* size of log format structure */
__be32 icl_ag; /* ag being allocated in */
__be32 icl_agbno; /* start block of inode range */
__be32 icl_count; /* number of inodes to initialise */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 29a01ec89dd0..66948a9fd486 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -26,7 +26,7 @@
#define XLOG_RHASH_SIZE 16
#define XLOG_RHASH_SHIFT 2
#define XLOG_RHASH(tid) \
- ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
+ ((((uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
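An aside on the macro just touched: the recovery transaction hash simply shifts the tid and masks it into one of XLOG_RHASH_SIZE buckets. A minimal userspace illustration of the same arithmetic (constants copied from the header; main() is ours):

#include <stdint.h>
#include <stdio.h>

#define XLOG_RHASH_SIZE 16
#define XLOG_RHASH_SHIFT 2
#define XLOG_RHASH(tid) \
	((((uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))

int main(void)
{
	/* 0x1234 >> 2 = 0x48d; 0x48d & 0xf = 0xd, i.e. bucket 13 */
	printf("bucket = %u\n", XLOG_RHASH(0x1234u));
	return 0;
}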
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 8eed51275bb3..2834574cb6e7 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -27,8 +27,8 @@
* they may need 64-bit accounting. Hence, 64-bit quota-counters,
* and quota-limits. This is a waste in the common case, but hey ...
*/
-typedef __uint64_t xfs_qcnt_t;
-typedef __uint16_t xfs_qwarncnt_t;
+typedef uint64_t xfs_qcnt_t;
+typedef uint16_t xfs_qwarncnt_t;
/*
* flags for q_flags field in the dquot.
@@ -136,6 +136,8 @@ typedef __uint16_t xfs_qwarncnt_t;
*/
#define XFS_QMOPT_INHERIT 0x1000000
+#define XFS_QMOPT_NOLOCK 0x2000000 /* don't ilock during dqget */
+
/*
* flags to xfs_trans_mod_dquot.
*/
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index b177ef33cd4c..900ea231f9a3 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -784,14 +784,6 @@ xfs_refcount_merge_extents(
}
/*
- * While we're adjusting the refcounts records of an extent, we have
- * to keep an eye on the number of extents we're dirtying -- run too
- * many in a single transaction and we'll exceed the transaction's
- * reservation and crash the fs. Each record adds 12 bytes to the
- * log (plus any key updates) so we'll conservatively assume 24 bytes
- * per record. We must also leave space for btree splits on both ends
- * of the range and space for the CUD and a new CUI.
- *
* XXX: This is a pretty hand-wavy estimate. The penalty for guessing
* true incorrectly is a shutdown FS; the penalty for guessing false
* incorrectly is more transaction rolls than might be necessary.
@@ -813,8 +805,7 @@ xfs_refcount_still_have_space(
*/
if (cur->bc_private.a.priv.refc.nr_ops > 2 &&
XFS_TEST_ERROR(false, cur->bc_mp,
- XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE,
- XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE))
+ XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
return false;
if (cur->bc_private.a.priv.refc.nr_ops == 0)
@@ -822,7 +813,7 @@ xfs_refcount_still_have_space(
else if (overhead > cur->bc_tp->t_log_res)
return false;
return cur->bc_tp->t_log_res - overhead >
- cur->bc_private.a.priv.refc.nr_ops * 32;
+ cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}
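The check above is easier to audit now that the magic 32 is a named constant. A self-contained restatement of the headroom test (function and parameter names are ours; the real code reads these values off the btree cursor and transaction):

#include <stdbool.h>

#define XFS_REFCOUNT_ITEM_OVERHEAD	32

/* Hypothetical mirror of the xfs_refcount_still_have_space() logic. */
static bool still_have_space(unsigned int log_res, unsigned int overhead,
			     unsigned int nr_ops)
{
	if (nr_ops == 0)
		return true;		/* nothing dirtied yet */
	if (overhead > log_res)
		return false;		/* fixed overhead alone overflows */
	return log_res - overhead > nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}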
/*
@@ -1076,8 +1067,7 @@ xfs_refcount_finish_one(
blockcount);
if (XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_REFCOUNT_FINISH_ONE,
- XFS_RANDOM_REFCOUNT_FINISH_ONE))
+ XFS_ERRTAG_REFCOUNT_FINISH_ONE))
return -EIO;
/*
@@ -1629,13 +1619,28 @@ xfs_refcount_recover_cow_leftovers(
if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START)
return -EOPNOTSUPP;
- error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ INIT_LIST_HEAD(&debris);
+
+ /*
+ * In this first part, we use an empty transaction to gather up
+ * all the leftover CoW extents so that we can subsequently
+ * delete them. The empty transaction is used to avoid
+ * a buffer lock deadlock if there happens to be a loop in the
+ * refcountbt because we're allowed to re-grab a buffer that is
+ * already attached to our transaction. When we're done
+ * recording the CoW debris we cancel the (empty) transaction
+ * and everything goes away cleanly.
+ */
+ error = xfs_trans_alloc_empty(mp, &tp);
if (error)
return error;
- cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
+
+ error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ if (error)
+ goto out_trans;
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL);
/* Find all the leftover CoW staging extents. */
- INIT_LIST_HEAD(&debris);
memset(&low, 0, sizeof(low));
memset(&high, 0, sizeof(high));
low.rc.rc_startblock = XFS_REFC_COW_START;
@@ -1645,10 +1650,11 @@ xfs_refcount_recover_cow_leftovers(
if (error)
goto out_cursor;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
- xfs_buf_relse(agbp);
+ xfs_trans_brelse(tp, agbp);
+ xfs_trans_cancel(tp);
/* Now iterate the list to free the leftovers */
- list_for_each_entry(rr, &debris, rr_list) {
+ list_for_each_entry_safe(rr, n, &debris, rr_list) {
/* Set up transaction. */
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
if (error)
@@ -1676,8 +1682,16 @@ xfs_refcount_recover_cow_leftovers(
error = xfs_trans_commit(tp);
if (error)
goto out_free;
+
+ list_del(&rr->rr_list);
+ kmem_free(rr);
}
+ return error;
+out_defer:
+ xfs_defer_cancel(&dfops);
+out_trans:
+ xfs_trans_cancel(tp);
out_free:
/* Free the leftover list */
list_for_each_entry_safe(rr, n, &debris, rr_list) {
@@ -1688,11 +1702,6 @@ out_free:
out_cursor:
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- xfs_buf_relse(agbp);
- goto out_free;
-
-out_defer:
- xfs_defer_cancel(&dfops);
- xfs_trans_cancel(tp);
- goto out_free;
+ xfs_trans_brelse(tp, agbp);
+ goto out_trans;
}
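The restructured function follows a two-phase pattern worth calling out: gather everything under an empty (hence freely cancellable) transaction, then process each leftover in its own real transaction. An abbreviated outline, using only the calls visible in the hunk above (error handling elided, so treat this as a sketch rather than buildable code):

/* Phase 1: collect, holding buffers via an empty transaction. */
xfs_trans_alloc_empty(mp, &tp);
xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL);
/* ... walk the refcountbt, recording leftovers on &debris ... */
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
xfs_trans_brelse(tp, agbp);
xfs_trans_cancel(tp);		/* empty, so cancelling is always safe */

/* Phase 2: one real transaction per leftover extent. */
list_for_each_entry_safe(rr, n, &debris, rr_list) {
	xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
	/* free the CoW staging extent, commit, then drop the record */
	list_del(&rr->rr_list);
	kmem_free(rr);
}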
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 098dc668ab2c..eafb9d1f3b37 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp,
extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
xfs_agnumber_t agno);
+/*
+ * While we're adjusting the refcounts records of an extent, we have
+ * to keep an eye on the number of extents we're dirtying -- run too
+ * many in a single transaction and we'll exceed the transaction's
+ * reservation and crash the fs. Each record adds 12 bytes to the
+ * log (plus any key updates) so we'll conservatively assume 32 bytes
+ * per record. We must also leave space for btree splits on both ends
+ * of the range and space for the CUD and a new CUI.
+ */
+#define XFS_REFCOUNT_ITEM_OVERHEAD 32
+
+static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
+{
+ return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
+}
+
#endif /* __XFS_REFCOUNT_H__ */
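A quick worked instance of the new helper, since the arithmetic is easy to sanity-check by hand (the sample reservation value is invented for illustration; the real function returns xfs_fileoff_t):

#include <stdio.h>

#define XFS_REFCOUNT_ITEM_OVERHEAD	32

static long xfs_refcount_max_unmap(int log_res)
{
	/* spend at most 3/4 of the reservation, 32 bytes per record */
	return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
}

int main(void)
{
	/* 100000 * 3/4 = 75000; 75000 / 32 = 2343 records */
	printf("%ld\n", xfs_refcount_max_unmap(100000));
	return 0;
}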
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 50add5272807..3c59dd3d58d7 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -202,7 +202,7 @@ xfs_refcountbt_init_ptr_from_cur(
ptr->s = agf->agf_refcount_root;
}
-STATIC __int64_t
+STATIC int64_t
xfs_refcountbt_key_diff(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
@@ -210,16 +210,16 @@ xfs_refcountbt_key_diff(
struct xfs_refcount_irec *rec = &cur->bc_rec.rc;
struct xfs_refcount_key *kp = &key->refc;
- return (__int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock;
+ return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock;
}
-STATIC __int64_t
+STATIC int64_t
xfs_refcountbt_diff_two_keys(
struct xfs_btree_cur *cur,
union xfs_btree_key *k1,
union xfs_btree_key *k2)
{
- return (__int64_t)be32_to_cpu(k1->refc.rc_startblock) -
+ return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
be32_to_cpu(k2->refc.rc_startblock);
}
@@ -285,7 +285,6 @@ const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
.verify_write = xfs_refcountbt_write_verify,
};
-#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
xfs_refcountbt_keys_inorder(
struct xfs_btree_cur *cur,
@@ -306,7 +305,6 @@ xfs_refcountbt_recs_inorder(
be32_to_cpu(r1->refc.rc_blockcount) <=
be32_to_cpu(r2->refc.rc_startblock);
}
-#endif
static const struct xfs_btree_ops xfs_refcountbt_ops = {
.rec_len = sizeof(struct xfs_refcount_rec),
@@ -325,10 +323,8 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = {
.key_diff = xfs_refcountbt_key_diff,
.buf_ops = &xfs_refcountbt_buf_ops,
.diff_two_keys = xfs_refcountbt_diff_two_keys,
-#if defined(DEBUG) || defined(XFS_WARN)
.keys_inorder = xfs_refcountbt_keys_inorder,
.recs_inorder = xfs_refcountbt_recs_inorder,
-#endif
};
/*
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 06cfb93c2ef9..55c88a732690 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -179,7 +179,8 @@ done:
return error;
}
-static int
+/* Convert an internal btree record to an rmap record. */
+int
xfs_rmap_btrec_to_irec(
union xfs_btree_rec *rec,
struct xfs_rmap_irec *irec)
@@ -2061,7 +2062,7 @@ int
xfs_rmap_finish_one(
struct xfs_trans *tp,
enum xfs_rmap_intent_type type,
- __uint64_t owner,
+ uint64_t owner,
int whichfork,
xfs_fileoff_t startoff,
xfs_fsblock_t startblock,
@@ -2086,8 +2087,7 @@ xfs_rmap_finish_one(
startoff, blockcount, state);
if (XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_RMAP_FINISH_ONE,
- XFS_RANDOM_RMAP_FINISH_ONE))
+ XFS_ERRTAG_RMAP_FINISH_ONE))
return -EIO;
/*
@@ -2182,7 +2182,7 @@ __xfs_rmap_add(
struct xfs_mount *mp,
struct xfs_defer_ops *dfops,
enum xfs_rmap_intent_type type,
- __uint64_t owner,
+ uint64_t owner,
int whichfork,
struct xfs_bmbt_irec *bmap)
{
@@ -2266,7 +2266,7 @@ xfs_rmap_alloc_extent(
xfs_agnumber_t agno,
xfs_agblock_t bno,
xfs_extlen_t len,
- __uint64_t owner)
+ uint64_t owner)
{
struct xfs_bmbt_irec bmap;
@@ -2290,7 +2290,7 @@ xfs_rmap_free_extent(
xfs_agnumber_t agno,
xfs_agblock_t bno,
xfs_extlen_t len,
- __uint64_t owner)
+ uint64_t owner)
{
struct xfs_bmbt_irec bmap;
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 98f908fea103..466ede637080 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -179,7 +179,7 @@ enum xfs_rmap_intent_type {
struct xfs_rmap_intent {
struct list_head ri_list;
enum xfs_rmap_intent_type ri_type;
- __uint64_t ri_owner;
+ uint64_t ri_owner;
int ri_whichfork;
struct xfs_bmbt_irec ri_bmap;
};
@@ -196,15 +196,15 @@ int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
struct xfs_bmbt_irec *imap);
int xfs_rmap_alloc_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
- __uint64_t owner);
+ uint64_t owner);
int xfs_rmap_free_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
- __uint64_t owner);
+ uint64_t owner);
void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
struct xfs_btree_cur *rcur, int error);
int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
- __uint64_t owner, int whichfork, xfs_fileoff_t startoff,
+ uint64_t owner, int whichfork, xfs_fileoff_t startoff,
xfs_fsblock_t startblock, xfs_filblks_t blockcount,
xfs_exntst_t state, struct xfs_btree_cur **pcur);
@@ -216,5 +216,8 @@ int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
struct xfs_rmap_irec *irec, int *stat);
int xfs_rmap_compare(const struct xfs_rmap_irec *a,
const struct xfs_rmap_irec *b);
+union xfs_btree_rec;
+int xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec,
+ struct xfs_rmap_irec *irec);
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 74e5a54bc428..9d9c9192584c 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -199,7 +199,7 @@ xfs_rmapbt_init_high_key_from_rec(
union xfs_btree_key *key,
union xfs_btree_rec *rec)
{
- __uint64_t off;
+ uint64_t off;
int adj;
adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
@@ -241,7 +241,7 @@ xfs_rmapbt_init_ptr_from_cur(
ptr->s = agf->agf_roots[cur->bc_btnum];
}
-STATIC __int64_t
+STATIC int64_t
xfs_rmapbt_key_diff(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
@@ -249,9 +249,9 @@ xfs_rmapbt_key_diff(
struct xfs_rmap_irec *rec = &cur->bc_rec.r;
struct xfs_rmap_key *kp = &key->rmap;
__u64 x, y;
- __int64_t d;
+ int64_t d;
- d = (__int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
+ d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
if (d)
return d;
@@ -271,7 +271,7 @@ xfs_rmapbt_key_diff(
return 0;
}
-STATIC __int64_t
+STATIC int64_t
xfs_rmapbt_diff_two_keys(
struct xfs_btree_cur *cur,
union xfs_btree_key *k1,
@@ -279,10 +279,10 @@ xfs_rmapbt_diff_two_keys(
{
struct xfs_rmap_key *kp1 = &k1->rmap;
struct xfs_rmap_key *kp2 = &k2->rmap;
- __int64_t d;
+ int64_t d;
__u64 x, y;
- d = (__int64_t)be32_to_cpu(kp1->rm_startblock) -
+ d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
be32_to_cpu(kp2->rm_startblock);
if (d)
return d;
@@ -377,17 +377,16 @@ const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
.verify_write = xfs_rmapbt_write_verify,
};
-#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
xfs_rmapbt_keys_inorder(
struct xfs_btree_cur *cur,
union xfs_btree_key *k1,
union xfs_btree_key *k2)
{
- __uint32_t x;
- __uint32_t y;
- __uint64_t a;
- __uint64_t b;
+ uint32_t x;
+ uint32_t y;
+ uint64_t a;
+ uint64_t b;
x = be32_to_cpu(k1->rmap.rm_startblock);
y = be32_to_cpu(k2->rmap.rm_startblock);
@@ -414,10 +413,10 @@ xfs_rmapbt_recs_inorder(
union xfs_btree_rec *r1,
union xfs_btree_rec *r2)
{
- __uint32_t x;
- __uint32_t y;
- __uint64_t a;
- __uint64_t b;
+ uint32_t x;
+ uint32_t y;
+ uint64_t a;
+ uint64_t b;
x = be32_to_cpu(r1->rmap.rm_startblock);
y = be32_to_cpu(r2->rmap.rm_startblock);
@@ -437,7 +436,6 @@ xfs_rmapbt_recs_inorder(
return 1;
return 0;
}
-#endif /* DEBUG */
static const struct xfs_btree_ops xfs_rmapbt_ops = {
.rec_len = sizeof(struct xfs_rmap_rec),
@@ -456,10 +454,8 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = {
.key_diff = xfs_rmapbt_key_diff,
.buf_ops = &xfs_rmapbt_buf_ops,
.diff_two_keys = xfs_rmapbt_diff_two_keys,
-#if defined(DEBUG) || defined(XFS_WARN)
.keys_inorder = xfs_rmapbt_keys_inorder,
.recs_inorder = xfs_rmapbt_recs_inorder,
-#endif
};
/*
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index e47b99e59f60..5d4e43ef4eea 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
* Get a buffer for the bitmap or summary file block specified.
* The buffer is returned read and locked.
*/
-static int
+int
xfs_rtbuf_get(
xfs_mount_t *mp, /* file system mount structure */
xfs_trans_t *tp, /* transaction pointer */
@@ -1011,7 +1011,7 @@ xfs_rtfree_extent(
mp->m_sb.sb_rextents) {
if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
- *(__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
+ *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
}
return 0;
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 584ec896a533..9b5aae2bcc0b 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -448,7 +448,7 @@ xfs_sb_quota_to_disk(
struct xfs_dsb *to,
struct xfs_sb *from)
{
- __uint16_t qflags = from->sb_qflags;
+ uint16_t qflags = from->sb_qflags;
to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
if (xfs_sb_version_has_pquotino(from)) {
@@ -756,7 +756,7 @@ xfs_sb_mount_common(
mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
- mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
+ mp->m_ialloc_inos = (int)MAX((uint16_t)XFS_INODES_PER_CHUNK,
sbp->sb_inopblock);
mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 2e2c6716b623..c484877129a0 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -114,7 +114,7 @@ xfs_symlink_verify(
if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
return false;
if (be32_to_cpu(dsl->sl_offset) +
- be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
+ be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN)
return false;
if (dsl->sl_owner == 0)
return false;
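The verifier change swaps MAXPATHLEN for the on-disk limit. A tiny stand-alone restatement of the bounds test (XFS_SYMLINK_MAXLEN is 1024 in xfs_format.h; the function name and the explicit widening are ours):

#include <stdbool.h>
#include <stdint.h>

#define XFS_SYMLINK_MAXLEN	1024

/* Mirrors the sl_offset/sl_bytes check in xfs_symlink_verify(). */
static bool symlink_extent_ok(uint32_t sl_offset, uint32_t sl_bytes)
{
	/* widen before adding so the sum cannot wrap */
	return (uint64_t)sl_offset + sl_bytes < XFS_SYMLINK_MAXLEN;
}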
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index b456cca1bfb2..6bd916bd35e2 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -477,14 +477,14 @@ xfs_calc_mkdir_reservation(
/*
* Making a new symlink is the same as creating a new file, but
* with the added blocks for remote symlink data which can be up to 1kB in
- * length (MAXPATHLEN).
+ * length (XFS_SYMLINK_MAXLEN).
*/
STATIC uint
xfs_calc_symlink_reservation(
struct xfs_mount *mp)
{
return xfs_calc_create_reservation(mp) +
- xfs_calc_buf_res(1, MAXPATHLEN);
+ xfs_calc_buf_res(1, XFS_SYMLINK_MAXLEN);
}
/*
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 717909f2f7b7..0220159bd463 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -18,34 +18,34 @@
#ifndef __XFS_TYPES_H__
#define __XFS_TYPES_H__
-typedef __uint32_t prid_t; /* project ID */
+typedef uint32_t prid_t; /* project ID */
-typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */
-typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */
-typedef __uint32_t xfs_extlen_t; /* extent length in blocks */
-typedef __uint32_t xfs_agnumber_t; /* allocation group number */
-typedef __int32_t xfs_extnum_t; /* # of extents in a file */
-typedef __int16_t xfs_aextnum_t; /* # extents in an attribute fork */
-typedef __int64_t xfs_fsize_t; /* bytes in a file */
-typedef __uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
+typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
+typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
+typedef uint32_t xfs_extlen_t; /* extent length in blocks */
+typedef uint32_t xfs_agnumber_t; /* allocation group number */
+typedef int32_t xfs_extnum_t; /* # of extents in a file */
+typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */
+typedef int64_t xfs_fsize_t; /* bytes in a file */
+typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
-typedef __int32_t xfs_suminfo_t; /* type of bitmap summary info */
-typedef __int32_t xfs_rtword_t; /* word type for bitmap manipulations */
+typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
+typedef int32_t xfs_rtword_t; /* word type for bitmap manipulations */
-typedef __int64_t xfs_lsn_t; /* log sequence number */
-typedef __int32_t xfs_tid_t; /* transaction identifier */
+typedef int64_t xfs_lsn_t; /* log sequence number */
+typedef int32_t xfs_tid_t; /* transaction identifier */
-typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
-typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
+typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
+typedef uint32_t xfs_dahash_t; /* dir/attr hash value */
-typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
-typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
-typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
-typedef __uint64_t xfs_fileoff_t; /* block number in a file */
-typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
+typedef uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
+typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
+typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
+typedef uint64_t xfs_fileoff_t; /* block number in a file */
+typedef uint64_t xfs_filblks_t; /* number of blocks in a file */
-typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
-typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
+typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
+typedef int64_t xfs_sfiloff_t; /* signed block number in a file */
/*
* Null values for the types.
@@ -125,7 +125,7 @@ struct xfs_name {
* uid_t and gid_t are hard-coded to 32 bits in the inode.
* Hence, an 'id' in a dquot is 32 bits..
*/
-typedef __uint32_t xfs_dqid_t;
+typedef uint32_t xfs_dqid_t;
/*
* Constants for bit manipulations.
diff --git a/fs/xfs/uuid.c b/fs/xfs/uuid.c
deleted file mode 100644
index b83f76b6d410..000000000000
--- a/fs/xfs/uuid.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include <xfs.h>
-
-/* IRIX interpretation of an uuid_t */
-typedef struct {
- __be32 uu_timelow;
- __be16 uu_timemid;
- __be16 uu_timehi;
- __be16 uu_clockseq;
- __be16 uu_node[3];
-} xfs_uu_t;
-
-/*
- * uuid_getnodeuniq - obtain the node unique fields of a UUID.
- *
- * This is not in any way a standard or condoned UUID function;
- * it just something that's needed for user-level file handles.
- */
-void
-uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
-{
- xfs_uu_t *uup = (xfs_uu_t *)uuid;
-
- fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) |
- be16_to_cpu(uup->uu_timemid);
- fsid[1] = be32_to_cpu(uup->uu_timelow);
-}
-
-int
-uuid_is_nil(uuid_t *uuid)
-{
- int i;
- char *cp = (char *)uuid;
-
- if (uuid == NULL)
- return 0;
- /* implied check of version number here... */
- for (i = 0; i < sizeof *uuid; i++)
- if (*cp++) return 0; /* not nil */
- return 1; /* is nil */
-}
-
-int
-uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
-{
- return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
-}
diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h
deleted file mode 100644
index 104db0f3bed6..000000000000
--- a/fs/xfs/uuid.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_SUPPORT_UUID_H__
-#define __XFS_SUPPORT_UUID_H__
-
-typedef struct {
- unsigned char __u_bits[16];
-} uuid_t;
-
-extern int uuid_is_nil(uuid_t *uuid);
-extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
-extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
-
-static inline void
-uuid_copy(uuid_t *dst, uuid_t *src)
-{
- memcpy(dst, src, sizeof(uuid_t));
-}
-
-#endif /* __XFS_SUPPORT_UUID_H__ */
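With both files gone, XFS leans on the kernel's shared UUID code instead. A hedged map from the removed helpers to their apparent generic replacements in <linux/uuid.h> (treat the pairing as an assumption based on this series, not as documentation):

#include <linux/uuid.h>

/* uuid_is_nil(&uu)    ->  uuid_is_null(&uu)                 */
/* uuid_equal(&a, &b)  ->  uuid_equal(&a, &b) (shared copy)  */
/* uuid_copy(&d, &s)   ->  uuid_copy(&d, &s)  (shared copy)  */

static inline bool xfs_uuid_ok(const uuid_t *uu)
{
	return !uuid_is_null(uu);	/* was: !uuid_is_nil(uu) */
}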
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index a742c47f7d5a..80cd0fd86783 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -24,6 +24,10 @@
#define XFS_BUF_LOCK_TRACKING 1
#endif
+#ifdef CONFIG_XFS_ASSERT_FATAL
+#define XFS_ASSERT_FATAL 1
+#endif
+
#ifdef CONFIG_XFS_WARN
#define XFS_WARN 1
#endif
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index b468e041f207..7034e17535de 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -170,8 +170,8 @@ xfs_get_acl(struct inode *inode, int type)
return acl;
}
-STATIC int
-__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+int
+__xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
struct xfs_inode *ip = XFS_I(inode);
unsigned char *ea_name;
@@ -268,5 +268,5 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
}
set_acl:
- return __xfs_set_acl(inode, type, acl);
+ return __xfs_set_acl(inode, acl, type);
}
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 286fa89217f5..04327318ef67 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -24,6 +24,7 @@ struct posix_acl;
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
#else
static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
{
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 09af0f7cd55e..6bf120bb1a17 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -276,7 +276,7 @@ xfs_end_io(
struct xfs_inode *ip = XFS_I(ioend->io_inode);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
- int error = ioend->io_bio->bi_error;
+ int error;
/*
* Just clean up the in-memory structures if the fs has been shut down.
@@ -289,6 +289,7 @@ xfs_end_io(
/*
* Clean up any COW blocks on an I/O error.
*/
+ error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
switch (ioend->io_type) {
case XFS_IO_COW:
@@ -332,7 +333,7 @@ xfs_end_bio(
else if (ioend->io_append_trans)
queue_work(mp->m_data_workqueue, &ioend->io_work);
else
- xfs_destroy_ioend(ioend, bio->bi_error);
+ xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}
STATIC int
@@ -500,11 +501,12 @@ xfs_submit_ioend(
* time.
*/
if (status) {
- ioend->io_bio->bi_error = status;
+ ioend->io_bio->bi_status = errno_to_blk_status(status);
bio_endio(ioend->io_bio);
return status;
}
+ ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
submit_bio(ioend->io_bio);
return 0;
}
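Both hunks above follow the boundary rule introduced by the bi_error -> bi_status switch: the bio carries a blk_status_t, and the value is converted the moment it crosses into or out of the block layer. In fragment form (blk_status_to_errno() and errno_to_blk_status() are the block-layer converters used in the hunks):

/* completion path: bio carries blk_status_t, callers want an errno */
int error = blk_status_to_errno(bio->bi_status);

/* failure path: we have an errno and must hand the bio a status */
bio->bi_status = errno_to_blk_status(status);
bio_endio(bio);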
@@ -564,6 +566,7 @@ xfs_chain_bio(
bio_chain(ioend->io_bio, new);
bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+ ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
submit_bio(ioend->io_bio);
ioend->io_bio = new;
}
@@ -836,7 +839,7 @@ xfs_writepage_map(
struct inode *inode,
struct page *page,
loff_t offset,
- __uint64_t end_offset)
+ uint64_t end_offset)
{
LIST_HEAD(submit_list);
struct xfs_ioend *ioend, *next;
@@ -991,7 +994,7 @@ xfs_do_writepage(
struct xfs_writepage_ctx *wpc = data;
struct inode *inode = page->mapping->host;
loff_t offset;
- __uint64_t end_offset;
+ uint64_t end_offset;
pgoff_t end_index;
trace_xfs_writepage(inode, page, 0, 0);
@@ -1316,9 +1319,12 @@ xfs_vm_bmap(
* The swap code (ab-)uses ->bmap to get a block mapping and then
* bypasses the file system for actual I/O. We really can't allow
* that on reflink inodes, so we have to skip out here. And yes,
- * 0 is the magic code for a bmap error..
+ * 0 is the magic code for a bmap error.
+ *
+ * Since we don't pass back blockdev info, we can't return bmap
+ * information for rt files either.
*/
- if (xfs_is_reflink_inode(ip))
+ if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
filemap_write_and_wait(mapping);
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index d14691aa02b4..5d5a5e277f35 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -117,6 +117,7 @@ typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
unsigned char *, int, int);
typedef struct xfs_attr_list_context {
+ struct xfs_trans *tp;
struct xfs_inode *dp; /* inode */
struct attrlist_cursor_kern *cursor; /* position in list */
char *alist; /* output buffer */
@@ -140,8 +141,10 @@ typedef struct xfs_attr_list_context {
* Overall external interface routines.
*/
int xfs_attr_inactive(struct xfs_inode *dp);
+int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *);
int xfs_attr_list_int(struct xfs_attr_list_context *);
int xfs_inode_hasattr(struct xfs_inode *ip);
+int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
unsigned char *value, int *valuelenp, int flags);
int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 97c45b6eb91e..545eca508d42 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -230,7 +230,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
*/
bp = NULL;
if (cursor->blkno > 0) {
- error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1,
+ error = xfs_da3_node_read(context->tp, dp, cursor->blkno, -1,
&bp, XFS_ATTR_FORK);
if ((error != 0) && (error != -EFSCORRUPTED))
return error;
@@ -242,7 +242,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
case XFS_DA_NODE_MAGIC:
case XFS_DA3_NODE_MAGIC:
trace_xfs_attr_list_wrong_blk(context);
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(context->tp, bp);
bp = NULL;
break;
case XFS_ATTR_LEAF_MAGIC:
@@ -254,18 +254,18 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
if (cursor->hashval > be32_to_cpu(
entries[leafhdr.count - 1].hashval)) {
trace_xfs_attr_list_wrong_blk(context);
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(context->tp, bp);
bp = NULL;
} else if (cursor->hashval <= be32_to_cpu(
entries[0].hashval)) {
trace_xfs_attr_list_wrong_blk(context);
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(context->tp, bp);
bp = NULL;
}
break;
default:
trace_xfs_attr_list_wrong_blk(context);
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(context->tp, bp);
bp = NULL;
}
}
@@ -279,9 +279,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
if (bp == NULL) {
cursor->blkno = 0;
for (;;) {
- __uint16_t magic;
+ uint16_t magic;
- error = xfs_da3_node_read(NULL, dp,
+ error = xfs_da3_node_read(context->tp, dp,
cursor->blkno, -1, &bp,
XFS_ATTR_FORK);
if (error)
@@ -297,7 +297,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
XFS_ERRLEVEL_LOW,
context->dp->i_mount,
node);
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(context->tp, bp);
return -EFSCORRUPTED;
}
@@ -313,10 +313,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
}
}
if (i == nodehdr.count) {
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(context->tp, bp);
return 0;
}
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(context->tp, bp);
}
}
ASSERT(bp != NULL);
@@ -333,12 +333,12 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
if (context->seen_enough || leafhdr.forw == 0)
break;
cursor->blkno = leafhdr.forw;
- xfs_trans_brelse(NULL, bp);
- error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp);
+ xfs_trans_brelse(context->tp, bp);
+ error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno, -1, &bp);
if (error)
return error;
}
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(context->tp, bp);
return 0;
}
@@ -448,16 +448,34 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
trace_xfs_attr_leaf_list(context);
context->cursor->blkno = 0;
- error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
+ error = xfs_attr3_leaf_read(context->tp, context->dp, 0, -1, &bp);
if (error)
return error;
xfs_attr3_leaf_list_int(bp, context);
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(context->tp, bp);
return 0;
}
int
+xfs_attr_list_int_ilocked(
+ struct xfs_attr_list_context *context)
+{
+ struct xfs_inode *dp = context->dp;
+
+ /*
+ * Decide on what work routines to call based on the inode size.
+ */
+ if (!xfs_inode_hasattr(dp))
+ return 0;
+ else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+ return xfs_attr_shortform_list(context);
+ else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+ return xfs_attr_leaf_list(context);
+ return xfs_attr_node_list(context);
+}
+
+int
xfs_attr_list_int(
xfs_attr_list_context_t *context)
{
@@ -470,19 +488,8 @@ xfs_attr_list_int(
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- /*
- * Decide on what work routines to call based on the inode size.
- */
lock_mode = xfs_ilock_attr_map_shared(dp);
- if (!xfs_inode_hasattr(dp)) {
- error = 0;
- } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
- error = xfs_attr_shortform_list(context);
- } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
- error = xfs_attr_leaf_list(context);
- } else {
- error = xfs_attr_node_list(context);
- }
+ error = xfs_attr_list_int_ilocked(context);
xfs_iunlock(dp, lock_mode);
return error;
}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index d419d23fa214..88073910fa5d 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -396,6 +396,7 @@ xfs_bui_recover(
struct xfs_map_extent *bmap;
xfs_fsblock_t startblock_fsb;
xfs_fsblock_t inode_fsb;
+ xfs_filblks_t count;
bool op_ok;
struct xfs_bud_log_item *budp;
enum xfs_bmap_intent_type type;
@@ -404,6 +405,7 @@ xfs_bui_recover(
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
struct xfs_defer_ops dfops;
+ struct xfs_bmbt_irec irec;
xfs_fsblock_t firstfsb;
ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
@@ -481,13 +483,24 @@ xfs_bui_recover(
}
xfs_trans_ijoin(tp, ip, 0);
+ count = bmap->me_len;
error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
ip, whichfork, bmap->me_startoff,
- bmap->me_startblock, bmap->me_len,
- state);
+ bmap->me_startblock, &count, state);
if (error)
goto err_dfops;
+ if (count > 0) {
+ ASSERT(type == XFS_BMAP_UNMAP);
+ irec.br_startblock = bmap->me_startblock;
+ irec.br_blockcount = count;
+ irec.br_startoff = bmap->me_startoff;
+ irec.br_state = state;
+ error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
+ if (error)
+ goto err_dfops;
+ }
+
/* Finish transaction, free inodes. */
error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 2b954308a1d6..93e955262d07 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -219,20 +219,24 @@ xfs_bmap_eof(
*/
/*
- * Count leaf blocks given a range of extent records.
+ * Count leaf blocks given a range of extent records. Delayed allocation
+ * extents are not counted towards the totals.
*/
STATIC void
xfs_bmap_count_leaves(
- xfs_ifork_t *ifp,
- xfs_extnum_t idx,
- int numrecs,
- int *count)
+ struct xfs_ifork *ifp,
+ xfs_extnum_t *numrecs,
+ xfs_filblks_t *count)
{
- int b;
-
- for (b = 0; b < numrecs; b++) {
- xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
- *count += xfs_bmbt_get_blockcount(frp);
+ xfs_extnum_t i;
+ xfs_extnum_t nr_exts = xfs_iext_count(ifp);
+
+ for (i = 0; i < nr_exts; i++) {
+ xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, i);
+ if (!isnullstartblock(xfs_bmbt_get_startblock(frp))) {
+ (*numrecs)++;
+ *count += xfs_bmbt_get_blockcount(frp);
+ }
}
}
@@ -245,7 +249,7 @@ xfs_bmap_disk_count_leaves(
struct xfs_mount *mp,
struct xfs_btree_block *block,
int numrecs,
- int *count)
+ xfs_filblks_t *count)
{
int b;
xfs_bmbt_rec_t *frp;
@@ -260,17 +264,18 @@ xfs_bmap_disk_count_leaves(
* Recursively walks each level of a btree
* to count total fsblocks in use.
*/
-STATIC int /* error */
+STATIC int
xfs_bmap_count_tree(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fsblock_t blockno, /* file system block number */
- int levelin, /* level in btree */
- int *count) /* Count of blocks */
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_ifork *ifp,
+ xfs_fsblock_t blockno,
+ int levelin,
+ xfs_extnum_t *nextents,
+ xfs_filblks_t *count)
{
int error;
- xfs_buf_t *bp, *nbp;
+ struct xfs_buf *bp, *nbp;
int level = levelin;
__be64 *pp;
xfs_fsblock_t bno = blockno;
@@ -303,8 +308,9 @@ xfs_bmap_count_tree(
/* Dive to the next level */
pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
- if (unlikely((error =
- xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
+ error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, nextents,
+ count);
+ if (error) {
xfs_trans_brelse(tp, bp);
XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
XFS_ERRLEVEL_LOW, mp);
@@ -316,6 +322,7 @@ xfs_bmap_count_tree(
for (;;) {
nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
numrecs = be16_to_cpu(block->bb_numrecs);
+ (*nextents) += numrecs;
xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
xfs_trans_brelse(tp, bp);
if (nextbno == NULLFSBLOCK)
@@ -334,46 +341,64 @@ xfs_bmap_count_tree(
}
/*
- * Count fsblocks of the given fork.
+ * Count fsblocks of the given fork. Delayed allocation extents are
+ * not counted towards the totals.
*/
-static int /* error */
+int
xfs_bmap_count_blocks(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- int whichfork, /* data or attr fork */
- int *count) /* out: count of blocks */
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_extnum_t *nextents,
+ xfs_filblks_t *count)
{
+ struct xfs_mount *mp; /* file system mount structure */
+ __be64 *pp; /* pointer to block address */
struct xfs_btree_block *block; /* current btree block */
+ struct xfs_ifork *ifp; /* fork structure */
xfs_fsblock_t bno; /* block # of "block" */
- xfs_ifork_t *ifp; /* fork structure */
int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
- __be64 *pp; /* pointer to block address */
+ int error;
bno = NULLFSBLOCK;
mp = ip->i_mount;
+ *nextents = 0;
+ *count = 0;
ifp = XFS_IFORK_PTR(ip, whichfork);
- if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
- xfs_bmap_count_leaves(ifp, 0, xfs_iext_count(ifp), count);
+ if (!ifp)
return 0;
- }
- /*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
- */
- block = ifp->if_broot;
- level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
- bno = be64_to_cpu(*pp);
- ASSERT(bno != NULLFSBLOCK);
- ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-
- if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
- XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
- mp);
- return -EFSCORRUPTED;
+ switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ case XFS_DINODE_FMT_EXTENTS:
+ xfs_bmap_count_leaves(ifp, nextents, count);
+ return 0;
+ case XFS_DINODE_FMT_BTREE:
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ block = ifp->if_broot;
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+ error = xfs_bmap_count_tree(mp, tp, ifp, bno, level,
+ nextents, count);
+ if (error) {
+ XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)",
+ XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+ return 0;
}
return 0;
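Since xfs_bmap_count_blocks() is now exported (see the new prototype in xfs_bmap_util.h below) and returns both totals in one call, a caller looks roughly like this (a usage sketch, not taken from the patch):

xfs_extnum_t	nextents;
xfs_filblks_t	count;
int		error;

error = xfs_bmap_count_blocks(tp, ip, XFS_DATA_FORK, &nextents, &count);
if (error)
	return error;
/* nextents/count deliberately exclude delayed-allocation extents */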
@@ -389,11 +414,11 @@ xfs_getbmapx_fix_eof_hole(
struct getbmapx *out, /* output structure */
int prealloced, /* this is a file with
* preallocated data space */
- __int64_t end, /* last block requested */
+ int64_t end, /* last block requested */
xfs_fsblock_t startblock,
bool moretocome)
{
- __int64_t fixlen;
+ int64_t fixlen;
xfs_mount_t *mp; /* file system mount point */
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_extnum_t lastx; /* last extent pointer */
@@ -455,8 +480,8 @@ xfs_getbmap_adjust_shared(
agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock);
- error = xfs_reflink_find_shared(mp, agno, agbno, map->br_blockcount,
- &ebno, &elen, true);
+ error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
+ map->br_blockcount, &ebno, &elen, true);
if (error)
return error;
@@ -514,9 +539,9 @@ xfs_getbmap(
xfs_bmap_format_t formatter, /* format to user */
void *arg) /* formatter arg */
{
- __int64_t bmvend; /* last block requested */
+ int64_t bmvend; /* last block requested */
int error = 0; /* return value */
- __int64_t fixlen; /* length for -1 case */
+ int64_t fixlen; /* length for -1 case */
int i; /* extent number */
int lock; /* lock state */
xfs_bmbt_irec_t *map; /* buffer for user's data */
@@ -582,9 +607,13 @@ xfs_getbmap(
}
break;
default:
+ /* Local format data forks report no extents. */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ bmv->bmv_entries = 0;
+ return 0;
+ }
if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
- ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+ ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
return -EINVAL;
if (xfs_get_extsz_hint(ip) ||
@@ -601,7 +630,7 @@ xfs_getbmap(
if (bmv->bmv_length == -1) {
fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
bmv->bmv_length =
- max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
+ max_t(int64_t, fixlen - bmv->bmv_offset, 0);
} else if (bmv->bmv_length == 0) {
bmv->bmv_entries = 0;
return 0;
@@ -712,7 +741,7 @@ xfs_getbmap(
* extents.
*/
if (map[i].br_startblock == DELAYSTARTBLOCK &&
- map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
+ map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
ASSERT((iflags & BMV_IF_DELALLOC) != 0);
if (map[i].br_startblock == HOLESTARTBLOCK &&
@@ -738,7 +767,7 @@ xfs_getbmap(
out[cur_ext].bmv_offset +
out[cur_ext].bmv_length;
bmv->bmv_length =
- max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+ max_t(int64_t, 0, bmvend - bmv->bmv_offset);
/*
* In case we don't want to return the hole,
@@ -1613,7 +1642,7 @@ xfs_swap_extents_check_format(
* extent format...
*/
if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
- if (XFS_IFORK_BOFF(ip) &&
+ if (XFS_IFORK_Q(ip) &&
XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
return -EINVAL;
if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
@@ -1623,7 +1652,7 @@ xfs_swap_extents_check_format(
/* Reciprocal target->temp btree format checks */
if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
- if (XFS_IFORK_BOFF(tip) &&
+ if (XFS_IFORK_Q(tip) &&
XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
return -EINVAL;
if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
@@ -1672,7 +1701,7 @@ xfs_swap_extent_rmap(
xfs_filblks_t ilen;
xfs_filblks_t rlen;
int nimaps;
- __uint64_t tip_flags2;
+ uint64_t tip_flags2;
/*
* If the source file has shared blocks, we must flag the donor
@@ -1785,10 +1814,11 @@ xfs_swap_extent_forks(
int *target_log_flags)
{
struct xfs_ifork tempifp, *ifp, *tifp;
- int aforkblks = 0;
- int taforkblks = 0;
+ xfs_filblks_t aforkblks = 0;
+ xfs_filblks_t taforkblks = 0;
+ xfs_extnum_t junk;
xfs_extnum_t nextents;
- __uint64_t tmp;
+ uint64_t tmp;
int error;
/*
@@ -1796,14 +1826,14 @@ xfs_swap_extent_forks(
*/
if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
- error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK,
+ error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
&aforkblks);
if (error)
return error;
}
if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
(tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
- error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
+ error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
&taforkblks);
if (error)
return error;
@@ -1846,15 +1876,15 @@ xfs_swap_extent_forks(
/*
* Fix the on-disk inode values
*/
- tmp = (__uint64_t)ip->i_d.di_nblocks;
+ tmp = (uint64_t)ip->i_d.di_nblocks;
ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
- tmp = (__uint64_t) ip->i_d.di_nextents;
+ tmp = (uint64_t) ip->i_d.di_nextents;
ip->i_d.di_nextents = tip->i_d.di_nextents;
tip->i_d.di_nextents = tmp;
- tmp = (__uint64_t) ip->i_d.di_format;
+ tmp = (uint64_t) ip->i_d.di_format;
ip->i_d.di_format = tip->i_d.di_format;
tip->i_d.di_format = tmp;
@@ -1923,7 +1953,7 @@ xfs_swap_extents(
int error = 0;
int lock_flags;
struct xfs_ifork *cowfp;
- __uint64_t f;
+ uint64_t f;
int resblks;
/*
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 135d8267e284..0cede1043571 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -70,4 +70,8 @@ int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, xfs_extnum_t *nextents,
+ xfs_filblks_t *count);
+
#endif /* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 62fa39276a24..72f038492ba8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -97,12 +97,16 @@ static inline void
xfs_buf_ioacct_inc(
struct xfs_buf *bp)
{
- if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
+ if (bp->b_flags & XBF_NO_IOACCT)
return;
ASSERT(bp->b_flags & XBF_ASYNC);
- bp->b_flags |= _XBF_IN_FLIGHT;
- percpu_counter_inc(&bp->b_target->bt_io_count);
+ spin_lock(&bp->b_lock);
+ if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
+ bp->b_state |= XFS_BSTATE_IN_FLIGHT;
+ percpu_counter_inc(&bp->b_target->bt_io_count);
+ }
+ spin_unlock(&bp->b_lock);
}
/*
@@ -110,14 +114,24 @@ xfs_buf_ioacct_inc(
* freed and unaccount from the buftarg.
*/
static inline void
-xfs_buf_ioacct_dec(
+__xfs_buf_ioacct_dec(
struct xfs_buf *bp)
{
- if (!(bp->b_flags & _XBF_IN_FLIGHT))
- return;
+ lockdep_assert_held(&bp->b_lock);
- bp->b_flags &= ~_XBF_IN_FLIGHT;
- percpu_counter_dec(&bp->b_target->bt_io_count);
+ if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
+ bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
+ percpu_counter_dec(&bp->b_target->bt_io_count);
+ }
+}
+
+static inline void
+xfs_buf_ioacct_dec(
+ struct xfs_buf *bp)
+{
+ spin_lock(&bp->b_lock);
+ __xfs_buf_ioacct_dec(bp);
+ spin_unlock(&bp->b_lock);
}
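The fix replaces an unlocked flag in b_flags with a state bit tested and set under b_lock, so increment and decrement can no longer race. The shape of the pattern, reduced to portable userspace C (a mutex and a plain counter stand in for b_lock and the per-cpu counter):

#include <pthread.h>
#include <stdbool.h>

struct buf {
	pthread_mutex_t	lock;		/* stands in for bp->b_lock */
	bool		in_flight;	/* stands in for XFS_BSTATE_IN_FLIGHT */
};

static long io_count;			/* stands in for bt_io_count */

static void ioacct_inc(struct buf *bp)
{
	pthread_mutex_lock(&bp->lock);
	if (!bp->in_flight) {		/* account each buffer at most once */
		bp->in_flight = true;
		io_count++;
	}
	pthread_mutex_unlock(&bp->lock);
}

static void ioacct_dec(struct buf *bp)
{
	pthread_mutex_lock(&bp->lock);
	if (bp->in_flight) {
		bp->in_flight = false;
		io_count--;
	}
	pthread_mutex_unlock(&bp->lock);
}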
/*
@@ -149,9 +163,9 @@ xfs_buf_stale(
* unaccounted (released to LRU) before that occurs. Drop in-flight
* status now to preserve accounting consistency.
*/
- xfs_buf_ioacct_dec(bp);
-
spin_lock(&bp->b_lock);
+ __xfs_buf_ioacct_dec(bp);
+
atomic_set(&bp->b_lru_ref, 0);
if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
(list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
@@ -979,12 +993,12 @@ xfs_buf_rele(
* ensures the decrement occurs only once per-buf.
*/
if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
- xfs_buf_ioacct_dec(bp);
+ __xfs_buf_ioacct_dec(bp);
goto out_unlock;
}
/* the last reference has been dropped ... */
- xfs_buf_ioacct_dec(bp);
+ __xfs_buf_ioacct_dec(bp);
if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
/*
* If the buffer is added to the LRU take a new reference to the
@@ -1180,7 +1194,7 @@ xfs_buf_ioerror_alert(
{
xfs_alert(bp->b_target->bt_mount,
"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
- (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
+ (uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
}
int
@@ -1213,8 +1227,11 @@ xfs_buf_bio_end_io(
* don't overwrite existing errors - otherwise we can lose errors on
* buffers that require multiple bios to complete.
*/
- if (bio->bi_error)
- cmpxchg(&bp->b_io_error, 0, bio->bi_error);
+ if (bio->bi_status) {
+ int error = blk_status_to_errno(bio->bi_status);
+
+ cmpxchg(&bp->b_io_error, 0, error);
+ }
if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
@@ -2033,6 +2050,66 @@ xfs_buf_delwri_submit(
return error;
}
+/*
+ * Push a single buffer on a delwri queue.
+ *
+ * The purpose of this function is to submit a single buffer of a delwri queue
+ * and return with the buffer still on the original queue. The waiting delwri
+ * buffer submission infrastructure guarantees transfer of the delwri queue
+ * buffer reference to a temporary wait list. We reuse this infrastructure to
+ * transfer the buffer back to the original queue.
+ *
+ * Note the buffer transitions from the queued state, to the submitted and wait
+ * listed state and back to the queued state during this call. The buffer
+ * locking and queue management logic between _delwri_pushbuf() and
+ * _delwri_queue() guarantee that the buffer cannot be queued to another list
+ * before returning.
+ */
+int
+xfs_buf_delwri_pushbuf(
+ struct xfs_buf *bp,
+ struct list_head *buffer_list)
+{
+ LIST_HEAD (submit_list);
+ int error;
+
+ ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+ trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
+
+ /*
+ * Isolate the buffer to a new local list so we can submit it for I/O
+ * independently from the rest of the original list.
+ */
+ xfs_buf_lock(bp);
+ list_move(&bp->b_list, &submit_list);
+ xfs_buf_unlock(bp);
+
+ /*
+ * Delwri submission clears the DELWRI_Q buffer flag and returns with
+ * the buffer on the wait list with an associated reference. Rather than
+ * bounce the buffer from a local wait list back to the original list
+ * after I/O completion, reuse the original list as the wait list.
+ */
+ xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
+
+ /*
+ * The buffer is now under I/O and wait listed as during typical delwri
+ * submission. Lock the buffer to wait for I/O completion. Rather than
+ * remove the buffer from the wait list and release the reference, we
+ * want to return with the buffer queued to the original list. The
+ * buffer already sits on the original list with a wait list reference,
+ * however. If we let the queue inherit that wait list reference, all we
+ * need to do is reset the DELWRI_Q flag.
+ */
+ xfs_buf_lock(bp);
+ error = bp->b_error;
+ bp->b_flags |= _XBF_DELWRI_Q;
+ xfs_buf_unlock(bp);
+
+ return error;
+}
+
int __init
xfs_buf_init(void)
{
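For context, a hypothetical caller of the new helper might look like the
sketch below (illustrative only, not part of this patch); the key property is
that the buffer is written out but remains on the caller's delwri queue
afterwards:

	/* Force one queued buffer to disk while keeping it on the
	 * caller's delwri list. */
	static int example_push_one(struct xfs_buf *bp,
				    struct list_head *buffer_list)
	{
		int error;

		error = xfs_buf_delwri_pushbuf(bp, buffer_list);
		if (error)
			return error;
		/* bp is back on buffer_list with _XBF_DELWRI_Q set. */
		return 0;
	}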
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8d1d44f87ce9..20721261dae5 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -63,7 +63,6 @@ typedef enum {
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
#define _XBF_COMPOUND (1 << 23)/* compound buffer */
-#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */
typedef unsigned int xfs_buf_flags_t;
@@ -84,14 +83,14 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
- { _XBF_COMPOUND, "COMPOUND" }, \
- { _XBF_IN_FLIGHT, "IN_FLIGHT" }
+ { _XBF_COMPOUND, "COMPOUND" }
/*
* Internal state flags.
*/
#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
+#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */
/*
* The xfs_buftarg contains 2 notions of "sector size" -
@@ -333,6 +332,7 @@ extern void xfs_buf_delwri_cancel(struct list_head *);
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
+extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
/* Buffer Daemon Setup Routines */
extern int xfs_buf_init(void);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 0306168af332..f6a8422e9562 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -636,20 +636,23 @@ xfs_buf_item_unlock(
/*
* Clean buffers, by definition, cannot be in the AIL. However, aborted
- * buffers may be dirty and hence in the AIL. Therefore if we are
- * aborting a buffer and we've just taken the last refernce away, we
- * have to check if it is in the AIL before freeing it. We need to free
- * it in this case, because an aborted transaction has already shut the
- * filesystem down and this is the last chance we will have to do so.
+ * buffers may be in the AIL regardless of dirty state. An aborted
+ * transaction that invalidates a buffer already in the AIL may have
+ * marked it stale and cleared the dirty state, for example.
+ *
+ * Therefore if we are aborting a buffer and we've just taken the last
+ * reference away, we have to check if it is in the AIL before freeing
+ * it. We need to free it in this case, because an aborted transaction
+ * has already shut the filesystem down and this is the last chance we
+ * will have to do so.
*/
if (atomic_dec_and_test(&bip->bli_refcount)) {
- if (clean)
- xfs_buf_item_relse(bp);
- else if (aborted) {
+ if (aborted) {
ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
- }
+ } else if (clean)
+ xfs_buf_item_relse(bp);
}
if (!(flags & XFS_BLI_HOLD))
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 20b7a5c6eb2f..ba2638d37031 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -44,7 +44,7 @@ static unsigned char xfs_dir3_filetype_table[] = {
static unsigned char
xfs_dir3_get_dtype(
struct xfs_mount *mp,
- __uint8_t filetype)
+ uint8_t filetype)
{
if (!xfs_sb_version_hasftype(&mp->m_sb))
return DT_UNKNOWN;
@@ -117,7 +117,7 @@ xfs_dir2_sf_getdents(
*/
sfep = xfs_dir2_sf_firstentry(sfp);
for (i = 0; i < sfp->count; i++) {
- __uint8_t filetype;
+ uint8_t filetype;
off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
xfs_dir2_sf_get_offset(sfep));
@@ -170,7 +170,7 @@ xfs_dir2_block_getdents(
return 0;
lock_mode = xfs_ilock_data_map_shared(dp);
- error = xfs_dir3_block_read(NULL, dp, &bp);
+ error = xfs_dir3_block_read(args->trans, dp, &bp);
xfs_iunlock(dp, lock_mode);
if (error)
return error;
@@ -194,7 +194,7 @@ xfs_dir2_block_getdents(
* Each object is a real entry (dep) or an unused one (dup).
*/
while (ptr < endptr) {
- __uint8_t filetype;
+ uint8_t filetype;
dup = (xfs_dir2_data_unused_t *)ptr;
/*
@@ -228,7 +228,7 @@ xfs_dir2_block_getdents(
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber),
xfs_dir3_get_dtype(dp->i_mount, filetype))) {
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(args->trans, bp);
return 0;
}
}
@@ -239,218 +239,104 @@ xfs_dir2_block_getdents(
*/
ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
0x7fffffff;
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(args->trans, bp);
return 0;
}
-struct xfs_dir2_leaf_map_info {
- xfs_extlen_t map_blocks; /* number of fsbs in map */
- xfs_dablk_t map_off; /* last mapped file offset */
- int map_size; /* total entries in *map */
- int map_valid; /* valid entries in *map */
- int nmap; /* mappings to ask xfs_bmapi */
- xfs_dir2_db_t curdb; /* db for current block */
- int ra_current; /* number of read-ahead blks */
- int ra_index; /* *map index for read-ahead */
- int ra_offset; /* map entry offset for ra */
- int ra_want; /* readahead count wanted */
- struct xfs_bmbt_irec map[]; /* map vector for blocks */
-};
-
+/*
+ * Read a directory block and initiate readahead for blocks beyond that.
+ * We maintain a sliding readahead window of the remaining space in the
+ * buffer rounded up to the nearest block.
+ */
STATIC int
xfs_dir2_leaf_readbuf(
struct xfs_da_args *args,
size_t bufsize,
- struct xfs_dir2_leaf_map_info *mip,
- xfs_dir2_off_t *curoff,
- struct xfs_buf **bpp,
- bool trim_map)
+ xfs_dir2_off_t *cur_off,
+ xfs_dablk_t *ra_blk,
+ struct xfs_buf **bpp)
{
struct xfs_inode *dp = args->dp;
struct xfs_buf *bp = NULL;
- struct xfs_bmbt_irec *map = mip->map;
+ struct xfs_da_geometry *geo = args->geo;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ struct xfs_bmbt_irec map;
struct blk_plug plug;
+ xfs_dir2_off_t new_off;
+ xfs_dablk_t next_ra;
+ xfs_dablk_t map_off;
+ xfs_dablk_t last_da;
+ xfs_extnum_t idx;
+ int ra_want;
int error = 0;
- int length;
- int i;
- int j;
- struct xfs_da_geometry *geo = args->geo;
-
- /*
- * If the caller just finished processing a buffer, it will tell us
- * we need to trim that block out of the mapping now it is done.
- */
- if (trim_map) {
- mip->map_blocks -= geo->fsbcount;
- /*
- * Loop to get rid of the extents for the
- * directory block.
- */
- for (i = geo->fsbcount; i > 0; ) {
- j = min_t(int, map->br_blockcount, i);
- map->br_blockcount -= j;
- map->br_startblock += j;
- map->br_startoff += j;
- /*
- * If mapping is done, pitch it from
- * the table.
- */
- if (!map->br_blockcount && --mip->map_valid)
- memmove(&map[0], &map[1],
- sizeof(map[0]) * mip->map_valid);
- i -= j;
- }
- }
- /*
- * Recalculate the readahead blocks wanted.
- */
- mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1;
- ASSERT(mip->ra_want >= 0);
-
- /*
- * If we don't have as many as we want, and we haven't
- * run out of data blocks, get some more mappings.
- */
- if (1 + mip->ra_want > mip->map_blocks &&
- mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) {
- /*
- * Get more bmaps, fill in after the ones
- * we already have in the table.
- */
- mip->nmap = mip->map_size - mip->map_valid;
- error = xfs_bmapi_read(dp, mip->map_off,
- xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) -
- mip->map_off,
- &map[mip->map_valid], &mip->nmap, 0);
-
- /*
- * Don't know if we should ignore this or try to return an
- * error. The trouble with returning errors is that readdir
- * will just stop without actually passing the error through.
- */
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK);
if (error)
- goto out; /* XXX */
-
- /*
- * If we got all the mappings we asked for, set the final map
- * offset based on the last bmap value received. Otherwise,
- * we've reached the end.
- */
- if (mip->nmap == mip->map_size - mip->map_valid) {
- i = mip->map_valid + mip->nmap - 1;
- mip->map_off = map[i].br_startoff + map[i].br_blockcount;
- } else
- mip->map_off = xfs_dir2_byte_to_da(geo,
- XFS_DIR2_LEAF_OFFSET);
-
- /*
- * Look for holes in the mapping, and eliminate them. Count up
- * the valid blocks.
- */
- for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
- if (map[i].br_startblock == HOLESTARTBLOCK) {
- mip->nmap--;
- length = mip->map_valid + mip->nmap - i;
- if (length)
- memmove(&map[i], &map[i + 1],
- sizeof(map[i]) * length);
- } else {
- mip->map_blocks += map[i].br_blockcount;
- i++;
- }
- }
- mip->map_valid += mip->nmap;
+ goto out;
}
/*
- * No valid mappings, so no more data blocks.
+ * Look for mapped directory blocks at or above the current offset.
+ * Truncate down to the nearest directory block to start the scanning
+ * operation.
*/
- if (!mip->map_valid) {
- *curoff = xfs_dir2_da_to_byte(geo, mip->map_off);
+ last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET);
+ map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off));
+ if (!xfs_iext_lookup_extent(dp, ifp, map_off, &idx, &map))
goto out;
- }
+ if (map.br_startoff >= last_da)
+ goto out;
+ xfs_trim_extent(&map, map_off, last_da - map_off);
- /*
- * Read the directory block starting at the first mapping.
- */
- mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff);
- error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
- map->br_blockcount >= geo->fsbcount ?
- XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) :
- -1, &bp);
- /*
- * Should just skip over the data block instead of giving up.
- */
+ /* Read the directory block of that first mapping. */
+ new_off = xfs_dir2_da_to_byte(geo, map.br_startoff);
+ if (new_off > *cur_off)
+ *cur_off = new_off;
+ error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, -1, &bp);
if (error)
- goto out; /* XXX */
-
- /*
- * Adjust the current amount of read-ahead: we just read a block that
- * was previously ra.
- */
- if (mip->ra_current)
- mip->ra_current -= geo->fsbcount;
+ goto out;
/*
- * Do we need more readahead?
- * Each loop tries to process 1 full dir blk; last may be partial.
+ * Start readahead for the next bufsize's worth of dir data blocks.
+ * We may have already issued readahead for some of that range;
+ * ra_blk tracks the last block we tried to read(ahead).
*/
+ ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
+ if (*ra_blk >= last_da)
+ goto out;
+ else if (*ra_blk == 0)
+ *ra_blk = map.br_startoff;
+ next_ra = map.br_startoff + geo->fsbcount;
+ if (next_ra >= last_da)
+ goto out_no_ra;
+ if (map.br_blockcount < geo->fsbcount &&
+ !xfs_iext_get_extent(ifp, ++idx, &map))
+ goto out_no_ra;
+ if (map.br_startoff >= last_da)
+ goto out_no_ra;
+ xfs_trim_extent(&map, next_ra, last_da - next_ra);
+
+ /* Start ra for each dir (not fs) block that has a mapping. */
blk_start_plug(&plug);
- for (mip->ra_index = mip->ra_offset = i = 0;
- mip->ra_want > mip->ra_current && i < mip->map_blocks;
- i += geo->fsbcount) {
- ASSERT(mip->ra_index < mip->map_valid);
- /*
- * Read-ahead a contiguous directory block.
- */
- if (i > mip->ra_current &&
- (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
- geo->fsbcount) {
- xfs_dir3_data_readahead(dp,
- map[mip->ra_index].br_startoff + mip->ra_offset,
- XFS_FSB_TO_DADDR(dp->i_mount,
- map[mip->ra_index].br_startblock +
- mip->ra_offset));
- mip->ra_current = i;
- }
-
- /*
- * Read-ahead a non-contiguous directory block. This doesn't
- * use our mapping, but this is a very rare case.
- */
- else if (i > mip->ra_current) {
- xfs_dir3_data_readahead(dp,
- map[mip->ra_index].br_startoff +
- mip->ra_offset, -1);
- mip->ra_current = i;
- }
-
- /*
- * Advance offset through the mapping table, processing a full
- * dir block even if it is fragmented into several extents.
- * But stop if we have consumed all valid mappings, even if
- * it's not yet a full directory block.
- */
- for (j = 0;
- j < geo->fsbcount && mip->ra_index < mip->map_valid;
- j += length ) {
- /*
- * The rest of this extent but not more than a dir
- * block.
- */
- length = min_t(int, geo->fsbcount - j,
- map[mip->ra_index].br_blockcount -
- mip->ra_offset);
- mip->ra_offset += length;
-
- /*
- * Advance to the next mapping if this one is used up.
- */
- if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
- mip->ra_offset = 0;
- mip->ra_index++;
+ while (ra_want > 0) {
+ next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
+ while (ra_want > 0 &&
+ next_ra < map.br_startoff + map.br_blockcount) {
+ if (next_ra >= last_da) {
+ *ra_blk = last_da;
+ break;
}
+ if (next_ra > *ra_blk) {
+ xfs_dir3_data_readahead(dp, next_ra, -2);
+ *ra_blk = next_ra;
+ }
+ ra_want -= geo->fsbcount;
+ next_ra += geo->fsbcount;
+ }
+ if (!xfs_iext_get_extent(ifp, ++idx, &map)) {
+ *ra_blk = last_da;
+ break;
}
}
blk_finish_plug(&plug);
@@ -458,6 +344,9 @@ xfs_dir2_leaf_readbuf(
out:
*bpp = bp;
return error;
+out_no_ra:
+ *ra_blk = last_da;
+ goto out;
}
/*
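The readahead sizing above boils down to integer block arithmetic: one
getdents buffer's worth of directory data, rounded up to whole filesystem
blocks. A runnable sketch of that calculation with assumed geometry (the
constants are illustrative, not derived from any particular mount):

	#include <stdio.h>

	#define howmany(x, y)	(((x) + ((y) - 1)) / (y))

	int main(void)
	{
		unsigned int bufsize = 32768;	/* getdents buffer size */
		unsigned int blksize = 4096;	/* directory block size */
		unsigned int fsbsize = 1024;	/* fs block size, 1 << fsblog */

		/* Filesystem blocks of readahead wanted, as computed in
		 * xfs_dir2_leaf_readbuf(). */
		unsigned int ra_want = howmany(bufsize + blksize, fsbsize);

		printf("ra_want = %u fs blocks\n", ra_want);	/* 36 */
		return 0;
	}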
@@ -475,14 +364,14 @@ xfs_dir2_leaf_getdents(
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_data_entry_t *dep; /* data entry */
xfs_dir2_data_unused_t *dup; /* unused entry */
- int error = 0; /* error return value */
- int length; /* temporary length value */
- int byteoff; /* offset in current block */
- xfs_dir2_off_t curoff; /* current overall offset */
- xfs_dir2_off_t newoff; /* new curoff after new blk */
char *ptr = NULL; /* pointer to current data */
- struct xfs_dir2_leaf_map_info *map_info;
struct xfs_da_geometry *geo = args->geo;
+ xfs_dablk_t rablk = 0; /* current readahead block */
+ xfs_dir2_off_t curoff; /* current overall offset */
+ int length; /* temporary length value */
+ int byteoff; /* offset in current block */
+ int lock_mode;
+ int error = 0; /* error return value */
/*
* If the offset is at or past the largest allowed value,
@@ -492,73 +381,35 @@ xfs_dir2_leaf_getdents(
return 0;
/*
- * Set up to bmap a number of blocks based on the caller's
- * buffer size, the directory block size, and the filesystem
- * block size.
- */
- length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
- map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
- (length * sizeof(struct xfs_bmbt_irec)),
- KM_SLEEP | KM_NOFS);
- map_info->map_size = length;
-
- /*
* Inside the loop we keep the main offset value as a byte offset
* in the directory file.
*/
curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
/*
- * Force this conversion through db so we truncate the offset
- * down to get the start of the data block.
- */
- map_info->map_off = xfs_dir2_db_to_da(geo,
- xfs_dir2_byte_to_db(geo, curoff));
-
- /*
* Loop over directory entries until we reach the end offset.
* Get more blocks and readahead as necessary.
*/
while (curoff < XFS_DIR2_LEAF_OFFSET) {
- __uint8_t filetype;
+ uint8_t filetype;
/*
* If we have no buffer, or we're off the end of the
* current buffer, need to get another one.
*/
if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
- int lock_mode;
- bool trim_map = false;
-
if (bp) {
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(args->trans, bp);
bp = NULL;
- trim_map = true;
}
lock_mode = xfs_ilock_data_map_shared(dp);
- error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
- &curoff, &bp, trim_map);
+ error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
+ &rablk, &bp);
xfs_iunlock(dp, lock_mode);
- if (error || !map_info->map_valid)
+ if (error || !bp)
break;
- /*
- * Having done a read, we need to set a new offset.
- */
- newoff = xfs_dir2_db_off_to_byte(geo,
- map_info->curdb, 0);
- /*
- * Start of the current block.
- */
- if (curoff < newoff)
- curoff = newoff;
- /*
- * Make sure we're in the right block.
- */
- else if (curoff > newoff)
- ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
- map_info->curdb);
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
/*
@@ -643,17 +494,22 @@ xfs_dir2_leaf_getdents(
ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
else
ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
- kmem_free(map_info);
if (bp)
- xfs_trans_brelse(NULL, bp);
+ xfs_trans_brelse(args->trans, bp);
return error;
}
/*
* Read a directory.
+ *
+ * If supplied, the transaction collects locked dir buffers to avoid
+ * nested buffer deadlocks. This function does not dirty the
+ * transaction. The caller should ensure that the inode is locked
+ * before calling this function.
*/
int
xfs_readdir(
+ struct xfs_trans *tp,
struct xfs_inode *dp,
struct dir_context *ctx,
size_t bufsize)
@@ -672,6 +528,7 @@ xfs_readdir(
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
+ args.trans = tp;
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_getdents(&args, ctx);
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 6a05d278da64..b2cde5426182 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -39,7 +39,7 @@ xfs_trim_extents(
xfs_daddr_t start,
xfs_daddr_t end,
xfs_daddr_t minlen,
- __uint64_t *blocks_trimmed)
+ uint64_t *blocks_trimmed)
{
struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
struct xfs_btree_cur *cur;
@@ -166,7 +166,7 @@ xfs_ioc_trim(
struct fstrim_range range;
xfs_daddr_t start, end, minlen;
xfs_agnumber_t start_agno, end_agno, agno;
- __uint64_t blocks_trimmed = 0;
+ uint64_t blocks_trimmed = 0;
int error, last_error = 0;
if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 9d06cc30e875..f89f7b5241e6 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -276,7 +276,7 @@ xfs_qm_init_dquot_blk(
void
xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
{
- __uint64_t space;
+ uint64_t space;
dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit);
@@ -472,18 +472,23 @@ xfs_qm_dqtobp(
struct xfs_mount *mp = dqp->q_mount;
xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
struct xfs_trans *tp = (tpp ? *tpp : NULL);
- uint lock_mode;
+ uint lock_mode = 0;
quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags);
dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
- lock_mode = xfs_ilock_data_map_shared(quotip);
+ ASSERT(!(flags & XFS_QMOPT_NOLOCK) ||
+ xfs_isilocked(quotip, XFS_ILOCK_SHARED) ||
+ xfs_isilocked(quotip, XFS_ILOCK_EXCL));
+ if (!(flags & XFS_QMOPT_NOLOCK))
+ lock_mode = xfs_ilock_data_map_shared(quotip);
if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
/*
* Return if this type of quotas is turned off while we
* didn't have the quota inode lock.
*/
- xfs_iunlock(quotip, lock_mode);
+ if (lock_mode)
+ xfs_iunlock(quotip, lock_mode);
return -ESRCH;
}
@@ -493,7 +498,8 @@ xfs_qm_dqtobp(
error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
- xfs_iunlock(quotip, lock_mode);
+ if (lock_mode)
+ xfs_iunlock(quotip, lock_mode);
if (error)
return error;
@@ -695,21 +701,18 @@ error0:
*/
static int
xfs_dq_get_next_id(
- xfs_mount_t *mp,
+ struct xfs_mount *mp,
uint type,
- xfs_dqid_t *id,
- loff_t eof)
+ xfs_dqid_t *id)
{
- struct xfs_inode *quotip;
+ struct xfs_inode *quotip = xfs_quota_inode(mp, type);
+ xfs_dqid_t next_id = *id + 1; /* simple advance */
+ uint lock_flags;
+ struct xfs_bmbt_irec got;
+ xfs_extnum_t idx;
xfs_fsblock_t start;
- loff_t offset;
- uint lock;
- xfs_dqid_t next_id;
int error = 0;
- /* Simple advance */
- next_id = *id + 1;
-
/* If we'd wrap past the max ID, stop */
if (next_id < *id)
return -ENOENT;
@@ -723,23 +726,25 @@ xfs_dq_get_next_id(
/* Nope, next_id is now past the current chunk, so find the next one */
start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk;
- quotip = xfs_quota_inode(mp, type);
- lock = xfs_ilock_data_map_shared(quotip);
-
- offset = __xfs_seek_hole_data(VFS_I(quotip), XFS_FSB_TO_B(mp, start),
- eof, SEEK_DATA);
- if (offset < 0)
- error = offset;
+ lock_flags = xfs_ilock_data_map_shared(quotip);
+ if (!(quotip->i_df.if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, quotip, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
- xfs_iunlock(quotip, lock);
+ if (xfs_iext_lookup_extent(quotip, &quotip->i_df, start, &idx, &got)) {
+ /* contiguous chunk, bump startoff for the id calculation */
+ if (got.br_startoff < start)
+ got.br_startoff = start;
+ *id = got.br_startoff * mp->m_quotainfo->qi_dqperchunk;
+ } else {
+ error = -ENOENT;
+ }
- /* -ENXIO is essentially "no more data" */
- if (error)
- return (error == -ENXIO ? -ENOENT: error);
+ xfs_iunlock(quotip, lock_flags);
- /* Convert next data offset back to a quota id */
- *id = XFS_B_TO_FSB(mp, offset) * mp->m_quotainfo->qi_dqperchunk;
- return 0;
+ return error;
}
/*
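The rewritten lookup maps a file offset in the quota inode straight back to a
dquot id: each filesystem block of the quota file holds qi_dqperchunk dquots,
so the first id in a mapped extent is simply startoff * dqperchunk. A small
sketch of that mapping with an assumed chunk size (illustrative; the real
value depends on block and dquot sizes):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t dqperchunk = 30;	/* assumed dquots per block */
		uint64_t startoff = 7;		/* extent start, fs blocks */

		/* First quota id stored in that extent, as computed in
		 * xfs_dq_get_next_id(). */
		uint64_t first_id = startoff * (uint64_t)dqperchunk;

		printf("next allocated id >= %llu\n",
		       (unsigned long long)first_id);	/* 210 */
		return 0;
	}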
@@ -762,7 +767,6 @@ xfs_qm_dqget(
struct xfs_quotainfo *qi = mp->m_quotainfo;
struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
struct xfs_dquot *dqp;
- loff_t eof = 0;
int error;
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -790,21 +794,6 @@ xfs_qm_dqget(
}
#endif
- /* Get the end of the quota file if we need it */
- if (flags & XFS_QMOPT_DQNEXT) {
- struct xfs_inode *quotip;
- xfs_fileoff_t last;
- uint lock_mode;
-
- quotip = xfs_quota_inode(mp, type);
- lock_mode = xfs_ilock_data_map_shared(quotip);
- error = xfs_bmap_last_offset(quotip, &last, XFS_DATA_FORK);
- xfs_iunlock(quotip, lock_mode);
- if (error)
- return error;
- eof = XFS_FSB_TO_B(mp, last);
- }
-
restart:
mutex_lock(&qi->qi_tree_lock);
dqp = radix_tree_lookup(tree, id);
@@ -823,7 +812,7 @@ restart:
if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
xfs_dqunlock(dqp);
mutex_unlock(&qi->qi_tree_lock);
- error = xfs_dq_get_next_id(mp, type, &id, eof);
+ error = xfs_dq_get_next_id(mp, type, &id);
if (error)
return error;
goto restart;
@@ -858,7 +847,7 @@ restart:
/* If we are asked to find next active id, keep looking */
if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) {
- error = xfs_dq_get_next_id(mp, type, &id, eof);
+ error = xfs_dq_get_next_id(mp, type, &id);
if (!error)
goto restart;
}
@@ -917,7 +906,7 @@ restart:
if (flags & XFS_QMOPT_DQNEXT) {
if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
xfs_qm_dqput(dqp);
- error = xfs_dq_get_next_id(mp, type, &id, eof);
+ error = xfs_dq_get_next_id(mp, type, &id);
if (error)
return error;
goto restart;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed7ee4e8af73..2f4feb959bfb 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -22,103 +22,280 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_error.h"
+#include "xfs_sysfs.h"
#ifdef DEBUG
-int xfs_etest[XFS_NUM_INJECT_ERROR];
-int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
-char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
-int xfs_error_test_active;
+static unsigned int xfs_errortag_random_default[] = {
+ XFS_RANDOM_DEFAULT,
+ XFS_RANDOM_IFLUSH_1,
+ XFS_RANDOM_IFLUSH_2,
+ XFS_RANDOM_IFLUSH_3,
+ XFS_RANDOM_IFLUSH_4,
+ XFS_RANDOM_IFLUSH_5,
+ XFS_RANDOM_IFLUSH_6,
+ XFS_RANDOM_DA_READ_BUF,
+ XFS_RANDOM_BTREE_CHECK_LBLOCK,
+ XFS_RANDOM_BTREE_CHECK_SBLOCK,
+ XFS_RANDOM_ALLOC_READ_AGF,
+ XFS_RANDOM_IALLOC_READ_AGI,
+ XFS_RANDOM_ITOBP_INOTOBP,
+ XFS_RANDOM_IUNLINK,
+ XFS_RANDOM_IUNLINK_REMOVE,
+ XFS_RANDOM_DIR_INO_VALIDATE,
+ XFS_RANDOM_BULKSTAT_READ_CHUNK,
+ XFS_RANDOM_IODONE_IOERR,
+ XFS_RANDOM_STRATREAD_IOERR,
+ XFS_RANDOM_STRATCMPL_IOERR,
+ XFS_RANDOM_DIOWRITE_IOERR,
+ XFS_RANDOM_BMAPIFORMAT,
+ XFS_RANDOM_FREE_EXTENT,
+ XFS_RANDOM_RMAP_FINISH_ONE,
+ XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE,
+ XFS_RANDOM_REFCOUNT_FINISH_ONE,
+ XFS_RANDOM_BMAP_FINISH_ONE,
+ XFS_RANDOM_AG_RESV_CRITICAL,
+ XFS_RANDOM_DROP_WRITES,
+ XFS_RANDOM_LOG_BAD_CRC,
+};
-int
-xfs_error_test(int error_tag, int *fsidp, char *expression,
- int line, char *file, unsigned long randfactor)
+struct xfs_errortag_attr {
+ struct attribute attr;
+ unsigned int tag;
+};
+
+static inline struct xfs_errortag_attr *
+to_attr(struct attribute *attr)
{
- int i;
- int64_t fsid;
+ return container_of(attr, struct xfs_errortag_attr, attr);
+}
- if (prandom_u32() % randfactor)
- return 0;
+static inline struct xfs_mount *
+to_mp(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
- memcpy(&fsid, fsidp, sizeof(xfs_fsid_t));
+ return container_of(kobj, struct xfs_mount, m_errortag_kobj);
+}
+
+STATIC ssize_t
+xfs_errortag_attr_store(
+ struct kobject *kobject,
+ struct attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_mount *mp = to_mp(kobject);
+ struct xfs_errortag_attr *xfs_attr = to_attr(attr);
+ int ret;
+ unsigned int val;
- for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
- if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
- xfs_warn(NULL,
- "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
- expression, file, line, xfs_etest_fsname[i]);
- return 1;
- }
+ if (strcmp(buf, "default") == 0) {
+ val = xfs_errortag_random_default[xfs_attr->tag];
+ } else {
+ ret = kstrtouint(buf, 0, &val);
+ if (ret)
+ return ret;
}
- return 0;
+ ret = xfs_errortag_set(mp, xfs_attr->tag, val);
+ if (ret)
+ return ret;
+ return count;
}
+STATIC ssize_t
+xfs_errortag_attr_show(
+ struct kobject *kobject,
+ struct attribute *attr,
+ char *buf)
+{
+ struct xfs_mount *mp = to_mp(kobject);
+ struct xfs_errortag_attr *xfs_attr = to_attr(attr);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ xfs_errortag_get(mp, xfs_attr->tag));
+}
+
+static const struct sysfs_ops xfs_errortag_sysfs_ops = {
+ .show = xfs_errortag_attr_show,
+ .store = xfs_errortag_attr_store,
+};
+
+#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \
+static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \
+ .attr = {.name = __stringify(_name), \
+ .mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \
+ .tag = (_tag), \
+}
+
+#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr
+
+XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR);
+XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1);
+XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2);
+XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3);
+XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4);
+XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5);
+XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6);
+XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF);
+XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK);
+XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK);
+XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF);
+XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI);
+XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP);
+XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK);
+XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE);
+XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE);
+XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK);
+XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR);
+XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR);
+XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR);
+XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR);
+XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT);
+XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT);
+XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE);
+XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE);
+XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE);
+XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE);
+XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
+XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES);
+XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
+
+static struct attribute *xfs_errortag_attrs[] = {
+ XFS_ERRORTAG_ATTR_LIST(noerror),
+ XFS_ERRORTAG_ATTR_LIST(iflush1),
+ XFS_ERRORTAG_ATTR_LIST(iflush2),
+ XFS_ERRORTAG_ATTR_LIST(iflush3),
+ XFS_ERRORTAG_ATTR_LIST(iflush4),
+ XFS_ERRORTAG_ATTR_LIST(iflush5),
+ XFS_ERRORTAG_ATTR_LIST(iflush6),
+ XFS_ERRORTAG_ATTR_LIST(dareadbuf),
+ XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk),
+ XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk),
+ XFS_ERRORTAG_ATTR_LIST(readagf),
+ XFS_ERRORTAG_ATTR_LIST(readagi),
+ XFS_ERRORTAG_ATTR_LIST(itobp),
+ XFS_ERRORTAG_ATTR_LIST(iunlink),
+ XFS_ERRORTAG_ATTR_LIST(iunlinkrm),
+ XFS_ERRORTAG_ATTR_LIST(dirinovalid),
+ XFS_ERRORTAG_ATTR_LIST(bulkstat),
+ XFS_ERRORTAG_ATTR_LIST(logiodone),
+ XFS_ERRORTAG_ATTR_LIST(stratread),
+ XFS_ERRORTAG_ATTR_LIST(stratcmpl),
+ XFS_ERRORTAG_ATTR_LIST(diowrite),
+ XFS_ERRORTAG_ATTR_LIST(bmapifmt),
+ XFS_ERRORTAG_ATTR_LIST(free_extent),
+ XFS_ERRORTAG_ATTR_LIST(rmap_finish_one),
+ XFS_ERRORTAG_ATTR_LIST(refcount_continue_update),
+ XFS_ERRORTAG_ATTR_LIST(refcount_finish_one),
+ XFS_ERRORTAG_ATTR_LIST(bmap_finish_one),
+ XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
+ XFS_ERRORTAG_ATTR_LIST(drop_writes),
+ XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
+ NULL,
+};
+
+struct kobj_type xfs_errortag_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_errortag_sysfs_ops,
+ .default_attrs = xfs_errortag_attrs,
+};
+
int
-xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp)
+xfs_errortag_init(
+ struct xfs_mount *mp)
{
- int i;
- int len;
- int64_t fsid;
+ mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
+ KM_SLEEP | KM_MAYFAIL);
+ if (!mp->m_errortag)
+ return -ENOMEM;
- if (error_tag >= XFS_ERRTAG_MAX)
- return -EINVAL;
+ return xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
+ &mp->m_kobj, "errortag");
+}
- memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
+void
+xfs_errortag_del(
+ struct xfs_mount *mp)
+{
+ xfs_sysfs_del(&mp->m_errortag_kobj);
+ kmem_free(mp->m_errortag);
+}
- for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
- if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
- xfs_warn(mp, "error tag #%d on", error_tag);
- return 0;
- }
- }
+bool
+xfs_errortag_test(
+ struct xfs_mount *mp,
+ const char *expression,
+ const char *file,
+ int line,
+ unsigned int error_tag)
+{
+ unsigned int randfactor;
- for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
- if (xfs_etest[i] == 0) {
- xfs_warn(mp, "Turned on XFS error tag #%d",
- error_tag);
- xfs_etest[i] = error_tag;
- xfs_etest_fsid[i] = fsid;
- len = strlen(mp->m_fsname);
- xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
- strcpy(xfs_etest_fsname[i], mp->m_fsname);
- xfs_error_test_active++;
- return 0;
- }
- }
+ /*
+	 * To be able to use error injection anywhere, we need to ensure that
+	 * the error injection mechanism has already been initialized.
+	 *
+	 * Code paths like I/O completion can be called before that
+	 * initialization is complete, but being able to inject errors in such
+	 * places is still useful.
+ */
+ if (!mp->m_errortag)
+ return false;
- xfs_warn(mp, "error tag overflow, too many turned on");
+ ASSERT(error_tag < XFS_ERRTAG_MAX);
+ randfactor = mp->m_errortag[error_tag];
+ if (!randfactor || prandom_u32() % randfactor)
+ return false;
- return 1;
+ xfs_warn_ratelimited(mp,
+"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
+ expression, file, line, mp->m_fsname);
+ return true;
}
int
-xfs_errortag_clearall(xfs_mount_t *mp, int loud)
+xfs_errortag_get(
+ struct xfs_mount *mp,
+ unsigned int error_tag)
{
- int64_t fsid;
- int cleared = 0;
- int i;
-
- memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
-
-
- for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
- if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
- xfs_etest[i] != 0) {
- cleared = 1;
- xfs_warn(mp, "Clearing XFS error tag #%d",
- xfs_etest[i]);
- xfs_etest[i] = 0;
- xfs_etest_fsid[i] = 0LL;
- kmem_free(xfs_etest_fsname[i]);
- xfs_etest_fsname[i] = NULL;
- xfs_error_test_active--;
- }
- }
+ if (error_tag >= XFS_ERRTAG_MAX)
+ return -EINVAL;
+
+ return mp->m_errortag[error_tag];
+}
+
+int
+xfs_errortag_set(
+ struct xfs_mount *mp,
+ unsigned int error_tag,
+ unsigned int tag_value)
+{
+ if (error_tag >= XFS_ERRTAG_MAX)
+ return -EINVAL;
- if (loud || cleared)
- xfs_warn(mp, "Cleared all XFS error tags for filesystem");
+ mp->m_errortag[error_tag] = tag_value;
+ return 0;
+}
+int
+xfs_errortag_add(
+ struct xfs_mount *mp,
+ unsigned int error_tag)
+{
+ if (error_tag >= XFS_ERRTAG_MAX)
+ return -EINVAL;
+
+ return xfs_errortag_set(mp, error_tag,
+ xfs_errortag_random_default[error_tag]);
+}
+
+int
+xfs_errortag_clearall(
+ struct xfs_mount *mp)
+{
+ memset(mp->m_errortag, 0, sizeof(unsigned int) * XFS_ERRTAG_MAX);
return 0;
}
#endif /* DEBUG */
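The injection check in xfs_errortag_test() fires when prandom_u32() %
randfactor is zero, so a tag value of N triggers roughly once every N
evaluations and 0 disables the tag entirely. A runnable userspace model of
that behavior (rand() stands in for prandom_u32(); names illustrative):

	#include <stdio.h>
	#include <stdlib.h>

	static int errortag_fires(unsigned int randfactor)
	{
		if (!randfactor)
			return 0;	/* tag disabled */
		return (rand() % randfactor) == 0;
	}

	int main(void)
	{
		unsigned int hits = 0, i;

		for (i = 0; i < 100000; i++)
			hits += errortag_fires(4);
		printf("fired %u/100000 times (~1 in 4 expected)\n", hits);
		return 0;
	}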
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 05f8666733a0..7577be5f09bc 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -96,7 +96,17 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25
#define XFS_ERRTAG_BMAP_FINISH_ONE 26
#define XFS_ERRTAG_AG_RESV_CRITICAL 27
-#define XFS_ERRTAG_MAX 28
+/*
+ * DEBUG mode instrumentation to test and/or trigger delayed allocation
+ * block killing in the event of failed writes. When enabled, all
+ * buffered writes are silently dropped and handled as if they failed.
+ * All delalloc blocks in the range of the write (including pre-existing
+ * delalloc blocks!) are tossed as part of the write failure error
+ * handling sequence.
+ */
+#define XFS_ERRTAG_DROP_WRITES 28
+#define XFS_ERRTAG_LOG_BAD_CRC 29
+#define XFS_ERRTAG_MAX 30
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -129,23 +139,29 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
#define XFS_RANDOM_BMAP_FINISH_ONE 1
#define XFS_RANDOM_AG_RESV_CRITICAL 4
+#define XFS_RANDOM_DROP_WRITES 1
+#define XFS_RANDOM_LOG_BAD_CRC 1
#ifdef DEBUG
-extern int xfs_error_test_active;
-extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
-
-#define XFS_NUM_INJECT_ERROR 10
-#define XFS_TEST_ERROR(expr, mp, tag, rf) \
- ((expr) || (xfs_error_test_active && \
- xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
- (rf))))
+extern int xfs_errortag_init(struct xfs_mount *mp);
+extern void xfs_errortag_del(struct xfs_mount *mp);
+extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression,
+ const char *file, int line, unsigned int error_tag);
+#define XFS_TEST_ERROR(expr, mp, tag) \
+ ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag)))
-extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp);
-extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
+extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag);
+extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag,
+ unsigned int tag_value);
+extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
+extern int xfs_errortag_clearall(struct xfs_mount *mp);
#else
-#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
-#define xfs_errortag_add(tag, mp) (ENOSYS)
-#define xfs_errortag_clearall(mp, loud) (ENOSYS)
+#define xfs_errortag_init(mp) (0)
+#define xfs_errortag_del(mp)
+#define XFS_TEST_ERROR(expr, mp, tag) (expr)
+#define xfs_errortag_set(mp, tag, val) (ENOSYS)
+#define xfs_errortag_add(mp, tag) (ENOSYS)
+#define xfs_errortag_clearall(mp) (ENOSYS)
#endif /* DEBUG */
/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 35703a801372..c4893e226fd8 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -140,7 +140,7 @@ xfs_file_fsync(
trace_xfs_file_fsync(ip);
- error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ error = file_write_and_wait_range(file, start, end);
if (error)
return error;
@@ -237,7 +237,11 @@ xfs_file_dax_read(
if (!count)
return 0; /* skip atime */
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ }
ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
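This trylock-then-decide pattern repeats in the DIO and DAX write paths
below: attempt the lock without blocking, return -EAGAIN only when the caller
asked for nonblocking I/O, and otherwise fall back to the normal blocking
acquisition. A userspace sketch of the same shape using pthreads (names
illustrative, not from this patch):

	#include <errno.h>
	#include <pthread.h>

	static int lock_for_io(pthread_mutex_t *lock, int nowait)
	{
		if (pthread_mutex_trylock(lock) == 0)
			return 0;		/* got it uncontended */
		if (nowait)
			return -EAGAIN;		/* IOCB_NOWAIT semantics */
		pthread_mutex_lock(lock);	/* blocking fallback */
		return 0;
	}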
@@ -541,7 +545,11 @@ xfs_file_dio_aio_write(
iolock = XFS_IOLOCK_SHARED;
}
- xfs_ilock(ip, iolock);
+ if (!xfs_ilock_nowait(ip, iolock)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ xfs_ilock(ip, iolock);
+ }
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
@@ -553,9 +561,15 @@ xfs_file_dio_aio_write(
* otherwise demote the lock if we had to take the exclusive lock
* for other reasons in xfs_file_aio_write_checks.
*/
- if (unaligned_io)
- inode_dio_wait(inode);
- else if (iolock == XFS_IOLOCK_EXCL) {
+ if (unaligned_io) {
+ /* If we are going to wait for other DIO to finish, bail */
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (atomic_read(&inode->i_dio_count))
+ return -EAGAIN;
+ } else {
+ inode_dio_wait(inode);
+ }
+ } else if (iolock == XFS_IOLOCK_EXCL) {
xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
@@ -585,7 +599,12 @@ xfs_file_dax_write(
size_t count;
loff_t pos;
- xfs_ilock(ip, iolock);
+ if (!xfs_ilock_nowait(ip, iolock)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ xfs_ilock(ip, iolock);
+ }
+
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
@@ -660,6 +679,7 @@ write_retry:
xfs_iunlock(ip, iolock);
eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+ xfs_icache_free_cowblocks(ip->i_mount, &eofb);
goto write_retry;
}
@@ -892,6 +912,7 @@ xfs_file_open(
return -EFBIG;
if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
return -EIO;
+ file->f_mode |= FMODE_AIO_NOWAIT;
return 0;
}
@@ -950,395 +971,7 @@ xfs_file_readdir(
*/
bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
- return xfs_readdir(ip, ctx, bufsize);
-}
-
-/*
- * This type is designed to indicate the type of offset we would like
- * to search from page cache for xfs_seek_hole_data().
- */
-enum {
- HOLE_OFF = 0,
- DATA_OFF,
-};
-
-/*
- * Lookup the desired type of offset from the given page.
- *
- * On success, return true and the offset argument will point to the
- * start of the region that was found. Otherwise this function will
- * return false and keep the offset argument unchanged.
- */
-STATIC bool
-xfs_lookup_buffer_offset(
- struct page *page,
- loff_t *offset,
- unsigned int type)
-{
- loff_t lastoff = page_offset(page);
- bool found = false;
- struct buffer_head *bh, *head;
-
- bh = head = page_buffers(page);
- do {
- /*
- * Unwritten extents that have data in the page
- * cache covering them can be identified by the
- * BH_Unwritten state flag. Pages with multiple
- * buffers might have a mix of holes, data and
- * unwritten extents - any buffer with valid
- * data in it should have BH_Uptodate flag set
- * on it.
- */
- if (buffer_unwritten(bh) ||
- buffer_uptodate(bh)) {
- if (type == DATA_OFF)
- found = true;
- } else {
- if (type == HOLE_OFF)
- found = true;
- }
-
- if (found) {
- *offset = lastoff;
- break;
- }
- lastoff += bh->b_size;
- } while ((bh = bh->b_this_page) != head);
-
- return found;
-}
-
-/*
- * This routine is called to find out and return a data or hole offset
- * from the page cache for unwritten extents according to the desired
- * type for xfs_seek_hole_data().
- *
- * The argument offset is used to tell where we start to search from the
- * page cache. Map is used to figure out the end points of the range to
- * lookup pages.
- *
- * Return true if the desired type of offset was found, and the argument
- * offset is filled with that address. Otherwise, return false and keep
- * offset unchanged.
- */
-STATIC bool
-xfs_find_get_desired_pgoff(
- struct inode *inode,
- struct xfs_bmbt_irec *map,
- unsigned int type,
- loff_t *offset)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- struct pagevec pvec;
- pgoff_t index;
- pgoff_t end;
- loff_t endoff;
- loff_t startoff = *offset;
- loff_t lastoff = startoff;
- bool found = false;
-
- pagevec_init(&pvec, 0);
-
- index = startoff >> PAGE_SHIFT;
- endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
- end = endoff >> PAGE_SHIFT;
- do {
- int want;
- unsigned nr_pages;
- unsigned int i;
-
- want = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
- want);
- /*
- * No page mapped into given range. If we are searching holes
- * and if this is the first time we got into the loop, it means
- * that the given offset is landed in a hole, return it.
- *
- * If we have already stepped through some block buffers to find
- * holes but they all contains data. In this case, the last
- * offset is already updated and pointed to the end of the last
- * mapped page, if it does not reach the endpoint to search,
- * that means there should be a hole between them.
- */
- if (nr_pages == 0) {
- /* Data search found nothing */
- if (type == DATA_OFF)
- break;
-
- ASSERT(type == HOLE_OFF);
- if (lastoff == startoff || lastoff < endoff) {
- found = true;
- *offset = lastoff;
- }
- break;
- }
-
- /*
- * At lease we found one page. If this is the first time we
- * step into the loop, and if the first page index offset is
- * greater than the given search offset, a hole was found.
- */
- if (type == HOLE_OFF && lastoff == startoff &&
- lastoff < page_offset(pvec.pages[0])) {
- found = true;
- break;
- }
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- loff_t b_offset;
-
- /*
- * At this point, the page may be truncated or
- * invalidated (changing page->mapping to NULL),
- * or even swizzled back from swapper_space to tmpfs
- * file mapping. However, page->index will not change
- * because we have a reference on the page.
- *
- * Searching done if the page index is out of range.
- * If the current offset is not reaches the end of
- * the specified search range, there should be a hole
- * between them.
- */
- if (page->index > end) {
- if (type == HOLE_OFF && lastoff < endoff) {
- *offset = lastoff;
- found = true;
- }
- goto out;
- }
-
- lock_page(page);
- /*
- * Page truncated or invalidated(page->mapping == NULL).
- * We can freely skip it and proceed to check the next
- * page.
- */
- if (unlikely(page->mapping != inode->i_mapping)) {
- unlock_page(page);
- continue;
- }
-
- if (!page_has_buffers(page)) {
- unlock_page(page);
- continue;
- }
-
- found = xfs_lookup_buffer_offset(page, &b_offset, type);
- if (found) {
- /*
- * The found offset may be less than the start
- * point to search if this is the first time to
- * come here.
- */
- *offset = max_t(loff_t, startoff, b_offset);
- unlock_page(page);
- goto out;
- }
-
- /*
- * We either searching data but nothing was found, or
- * searching hole but found a data buffer. In either
- * case, probably the next page contains the desired
- * things, update the last offset to it so.
- */
- lastoff = page_offset(page) + PAGE_SIZE;
- unlock_page(page);
- }
-
- /*
- * The number of returned pages less than our desired, search
- * done. In this case, nothing was found for searching data,
- * but we found a hole behind the last offset.
- */
- if (nr_pages < want) {
- if (type == HOLE_OFF) {
- *offset = lastoff;
- found = true;
- }
- break;
- }
-
- index = pvec.pages[i - 1]->index + 1;
- pagevec_release(&pvec);
- } while (index <= end);
-
-out:
- pagevec_release(&pvec);
- return found;
-}
-
-/*
- * caller must lock inode with xfs_ilock_data_map_shared,
- * can we craft an appropriate ASSERT?
- *
- * end is because the VFS-level lseek interface is defined such that any
- * offset past i_size shall return -ENXIO, but we use this for quota code
- * which does not maintain i_size, and we want to SEEK_DATA past i_size.
- */
-loff_t
-__xfs_seek_hole_data(
- struct inode *inode,
- loff_t start,
- loff_t end,
- int whence)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- loff_t uninitialized_var(offset);
- xfs_fileoff_t fsbno;
- xfs_filblks_t lastbno;
- int error;
-
- if (start >= end) {
- error = -ENXIO;
- goto out_error;
- }
-
- /*
- * Try to read extents from the first block indicated
- * by fsbno to the end block of the file.
- */
- fsbno = XFS_B_TO_FSBT(mp, start);
- lastbno = XFS_B_TO_FSB(mp, end);
-
- for (;;) {
- struct xfs_bmbt_irec map[2];
- int nmap = 2;
- unsigned int i;
-
- error = xfs_bmapi_read(ip, fsbno, lastbno - fsbno, map, &nmap,
- XFS_BMAPI_ENTIRE);
- if (error)
- goto out_error;
-
- /* No extents at given offset, must be beyond EOF */
- if (nmap == 0) {
- error = -ENXIO;
- goto out_error;
- }
-
- for (i = 0; i < nmap; i++) {
- offset = max_t(loff_t, start,
- XFS_FSB_TO_B(mp, map[i].br_startoff));
-
- /* Landed in the hole we wanted? */
- if (whence == SEEK_HOLE &&
- map[i].br_startblock == HOLESTARTBLOCK)
- goto out;
-
- /* Landed in the data extent we wanted? */
- if (whence == SEEK_DATA &&
- (map[i].br_startblock == DELAYSTARTBLOCK ||
- (map[i].br_state == XFS_EXT_NORM &&
- !isnullstartblock(map[i].br_startblock))))
- goto out;
-
- /*
- * Landed in an unwritten extent, try to search
- * for hole or data from page cache.
- */
- if (map[i].br_state == XFS_EXT_UNWRITTEN) {
- if (xfs_find_get_desired_pgoff(inode, &map[i],
- whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF,
- &offset))
- goto out;
- }
- }
-
- /*
- * We only received one extent out of the two requested. This
- * means we've hit EOF and didn't find what we are looking for.
- */
- if (nmap == 1) {
- /*
- * If we were looking for a hole, set offset to
- * the end of the file (i.e., there is an implicit
- * hole at the end of any file).
- */
- if (whence == SEEK_HOLE) {
- offset = end;
- break;
- }
- /*
- * If we were looking for data, it's nowhere to be found
- */
- ASSERT(whence == SEEK_DATA);
- error = -ENXIO;
- goto out_error;
- }
-
- ASSERT(i > 1);
-
- /*
- * Nothing was found, proceed to the next round of search
- * if the next reading offset is not at or beyond EOF.
- */
- fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
- start = XFS_FSB_TO_B(mp, fsbno);
- if (start >= end) {
- if (whence == SEEK_HOLE) {
- offset = end;
- break;
- }
- ASSERT(whence == SEEK_DATA);
- error = -ENXIO;
- goto out_error;
- }
- }
-
-out:
- /*
- * If at this point we have found the hole we wanted, the returned
- * offset may be bigger than the file size as it may be aligned to
- * page boundary for unwritten extents. We need to deal with this
- * situation in particular.
- */
- if (whence == SEEK_HOLE)
- offset = min_t(loff_t, offset, end);
-
- return offset;
-
-out_error:
- return error;
-}
-
-STATIC loff_t
-xfs_seek_hole_data(
- struct file *file,
- loff_t start,
- int whence)
-{
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- uint lock;
- loff_t offset, end;
- int error = 0;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- lock = xfs_ilock_data_map_shared(ip);
-
- end = i_size_read(inode);
- offset = __xfs_seek_hole_data(inode, start, end, whence);
- if (offset < 0) {
- error = offset;
- goto out_unlock;
- }
-
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
-
-out_unlock:
- xfs_iunlock(ip, lock);
-
- if (error)
- return error;
- return offset;
+ return xfs_readdir(NULL, ip, ctx, bufsize);
}
STATIC loff_t
@@ -1347,17 +980,25 @@ xfs_file_llseek(
loff_t offset,
int whence)
{
+ struct inode *inode = file->f_mapping->host;
+
+ if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
+ return -EIO;
+
switch (whence) {
- case SEEK_END:
- case SEEK_CUR:
- case SEEK_SET:
+ default:
return generic_file_llseek(file, offset, whence);
case SEEK_HOLE:
+ offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+ break;
case SEEK_DATA:
- return xfs_seek_hole_data(file, offset, whence);
- default:
- return -EINVAL;
+ offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+ break;
}
+
+ if (offset < 0)
+ return offset;
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
/*
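With llseek now routed through iomap_seek_hole()/iomap_seek_data(), the
userspace-visible behavior is the standard lseek(2) SEEK_HOLE/SEEK_DATA
contract. A runnable demonstration (assumes a filesystem and C library that
support these whence values):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		off_t hole, data;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return 1;

		data = lseek(fd, 0, SEEK_DATA);	/* first data region */
		hole = lseek(fd, 0, SEEK_HOLE);	/* first hole */
		printf("first data at %lld, first hole at %lld\n",
		       (long long)data, (long long)hole);
		close(fd);
		return 0;
	}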
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 3683819887a5..814ed729881d 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -828,6 +828,7 @@ xfs_getfsmap(
struct xfs_fsmap dkeys[2]; /* per-dev keys */
struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS];
struct xfs_getfsmap_info info = { NULL };
+ bool use_rmap;
int i;
int error = 0;
@@ -837,12 +838,14 @@ xfs_getfsmap(
!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1]))
return -EINVAL;
+ use_rmap = capable(CAP_SYS_ADMIN) &&
+ xfs_sb_version_hasrmapbt(&mp->m_sb);
head->fmh_entries = 0;
/* Set up our device handlers. */
memset(handlers, 0, sizeof(handlers));
handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (use_rmap)
handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
else
handlers[0].fn = xfs_getfsmap_datadev_bnobt;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 6ccaae9eb0ee..8f22fc579dbb 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -602,7 +602,7 @@ xfs_growfs_data_private(
if (nagimax)
mp->m_maxagi = nagimax;
if (mp->m_sb.sb_imax_pct) {
- __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
+ uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
do_div(icount, 100);
mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
} else
@@ -793,17 +793,17 @@ xfs_fs_counts(
int
xfs_reserve_blocks(
xfs_mount_t *mp,
- __uint64_t *inval,
+ uint64_t *inval,
xfs_fsop_resblks_t *outval)
{
- __int64_t lcounter, delta;
- __int64_t fdblks_delta = 0;
- __uint64_t request;
- __int64_t free;
+ int64_t lcounter, delta;
+ int64_t fdblks_delta = 0;
+ uint64_t request;
+ int64_t free;
int error = 0;
/* If inval is null, report current values and return */
- if (inval == (__uint64_t *)NULL) {
+ if (inval == (uint64_t *)NULL) {
if (!outval)
return -EINVAL;
outval->resblks = mp->m_resblks;
@@ -904,7 +904,7 @@ out:
int
xfs_fs_goingdown(
xfs_mount_t *mp,
- __uint32_t inflags)
+ uint32_t inflags)
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index f34915898fea..2954c13a3acd 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -22,9 +22,9 @@ extern int xfs_fs_geometry(xfs_mount_t *mp, xfs_fsop_geom_t *geo, int nversion);
extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in);
extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in);
extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
-extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
+extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval,
xfs_fsop_resblks_t *outval);
-extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
+extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags);
extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 687a4b01fc53..3e1cc3001bcb 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -47,4 +47,9 @@ xfs_param_t xfs_params = {
struct xfs_globals xfs_globals = {
.log_recovery_delay = 0, /* no delay by default */
+#ifdef XFS_ASSERT_FATAL
+ .bug_on_assert = true, /* assert failures BUG() */
+#else
+ .bug_on_assert = false, /* assert failures WARN() */
+#endif
};
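The new bug_on_assert global picks its default at compile time and can then
be overridden at runtime. A minimal sketch of that compile-time default
pattern (ASSERT_FATAL here is an illustrative stand-in for the kernel's
XFS_ASSERT_FATAL config option):

	#include <stdbool.h>
	#include <stdio.h>

	#ifdef ASSERT_FATAL
	static bool bug_on_assert = true;	/* assert failures fatal */
	#else
	static bool bug_on_assert = false;	/* assert failures warn */
	#endif

	int main(void)
	{
		printf("assert failures %s\n",
		       bug_on_assert ? "BUG()" : "WARN()");
		return 0;
	}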
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index f61c84f8e31a..0a9e6985a0d0 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -66,7 +66,6 @@ xfs_inode_alloc(
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
@@ -190,7 +189,7 @@ xfs_perag_set_reclaim_tag(
{
struct xfs_mount *mp = pag->pag_mount;
- ASSERT(spin_is_locked(&pag->pag_ici_lock));
+ lockdep_assert_held(&pag->pag_ici_lock);
if (pag->pag_ici_reclaimable++)
return;
@@ -212,7 +211,7 @@ xfs_perag_clear_reclaim_tag(
{
struct xfs_mount *mp = pag->pag_mount;
- ASSERT(spin_is_locked(&pag->pag_ici_lock));
+ lockdep_assert_held(&pag->pag_ici_lock);
if (--pag->pag_ici_reclaimable)
return;
@@ -270,12 +269,12 @@ xfs_inew_wait(
DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
do {
- prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
if (!xfs_iflags_test(ip, XFS_INEW))
break;
schedule();
} while (true);
- finish_wait(wq, &wait.wait);
+ finish_wait(wq, &wait.wq_entry);
}
/*
@@ -369,6 +368,11 @@ xfs_iget_cache_hit(
if (ip->i_flags & XFS_IRECLAIMABLE) {
trace_xfs_iget_reclaim(ip);
+ if (flags & XFS_IGET_INCORE) {
+ error = -EAGAIN;
+ goto out_error;
+ }
+
/*
* We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
* from stomping over us while we recycle the inode. We can't
@@ -433,7 +437,8 @@ xfs_iget_cache_hit(
if (lock_flags != 0)
xfs_ilock(ip, lock_flags);
- xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+ if (!(flags & XFS_IGET_INCORE))
+ xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
XFS_STATS_INC(mp, xs_ig_found);
return 0;
@@ -604,6 +609,10 @@ again:
goto out_error_or_again;
} else {
rcu_read_unlock();
+ if (flags & XFS_IGET_INCORE) {
+ error = -ENOENT;
+ goto out_error_or_again;
+ }
XFS_STATS_INC(mp, xs_ig_missed);
error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
@@ -624,7 +633,7 @@ again:
return 0;
out_error_or_again:
- if (error == -EAGAIN) {
+ if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
delay(1);
goto again;
}
@@ -633,6 +642,44 @@ out_error_or_again:
}
/*
+ * "Is this a cached inode that's also allocated?"
+ *
+ * Look up an inode by number in the given file system. If the inode is
+ * in cache and isn't in purgatory, set *inuse to true if the inode is
+ * allocated and false if it is not, and return 0. For all other cases
+ * (not in cache, being torn down, etc.), return a negative error code.
+ *
+ * The caller has to prevent inode allocation and freeing activity,
+ * presumably by locking the AGI buffer. This is to ensure that an
+ * inode cannot transition from allocated to freed until the caller is
+ * ready to allow that. If the inode is in an intermediate state (new,
+ * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
+ * inode is not in the cache, -ENOENT will be returned. The caller must
+ * deal with these scenarios appropriately.
+ *
+ * This is a specialized use case for the online scrubber; if you're
+ * reading this, you probably want xfs_iget.
+ */
+int
+xfs_icache_inode_is_allocated(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ bool *inuse)
+{
+ struct xfs_inode *ip;
+ int error;
+
+ error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
+ if (error)
+ return error;
+
+ *inuse = !!(VFS_I(ip)->i_mode);
+ IRELE(ip);
+ return 0;
+}
+
+/*
* The inode lookup is done in batches to keep the amount of lock traffic and
* radix tree lookups to a minimum. The batch size is a trade off between
* lookup reduction and stack usage. This is in the reclaim path, so we can't
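A hypothetical caller of xfs_icache_inode_is_allocated() (not part of this
patch) would treat -ENOENT and -EAGAIN as "the cache cannot answer" and fall
back to reading the inode from disk:

	/* Illustrative only: 1 = allocated, 0 = free or cache can't
	 * tell, negative = hard error. */
	static int example_check_ino(struct xfs_mount *mp, xfs_ino_t ino)
	{
		bool inuse;
		int error;

		error = xfs_icache_inode_is_allocated(mp, NULL, ino, &inuse);
		if (error == -ENOENT || error == -EAGAIN)
			return 0;	/* fall back to the inode cluster */
		if (error)
			return error;
		return inuse ? 1 : 0;
	}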
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 9183f77958ef..bff4d85e5498 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -47,6 +47,7 @@ struct xfs_eofblocks {
#define XFS_IGET_CREATE 0x1
#define XFS_IGET_UNTRUSTED 0x2
#define XFS_IGET_DONTCACHE 0x4
+#define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */
/*
* flags for AG inode iterator
@@ -126,4 +127,7 @@ xfs_fs_eofblocks_from_user(
return 0;
}
+int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_ino_t ino, bool *inuse);
+
#endif
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ec9826c56500..ceef77c0416a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -622,17 +622,17 @@ __xfs_iflock(
DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
do {
- prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
if (xfs_isiflocked(ip))
io_schedule();
} while (!xfs_iflock_nowait(ip));
- finish_wait(wq, &wait.wait);
+ finish_wait(wq, &wait.wq_entry);
}
STATIC uint
_xfs_dic2xflags(
- __uint16_t di_flags,
+ uint16_t di_flags,
uint64_t di_flags2,
bool has_attr)
{
@@ -855,8 +855,8 @@ xfs_ialloc(
inode->i_version = 1;
ip->i_d.di_flags2 = 0;
ip->i_d.di_cowextsize = 0;
- ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec;
+ ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
+ ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
}
@@ -2486,11 +2486,11 @@ __xfs_iunpin_wait(
xfs_iunpin(ip);
do {
- prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
if (xfs_ipincount(ip))
io_schedule();
} while (xfs_ipincount(ip));
- finish_wait(wq, &wait.wait);
+ finish_wait(wq, &wait.wq_entry);
}
void
@@ -3489,7 +3489,7 @@ xfs_iflush_int(
dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
- mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
+ mp, XFS_ERRTAG_IFLUSH_1)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
@@ -3499,7 +3499,7 @@ xfs_iflush_int(
if (XFS_TEST_ERROR(
(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
(ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
+ mp, XFS_ERRTAG_IFLUSH_3)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad regular inode %Lu, ptr 0x%p",
__func__, ip->i_ino, ip);
@@ -3510,7 +3510,7 @@ xfs_iflush_int(
(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
(ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
(ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
- mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
+ mp, XFS_ERRTAG_IFLUSH_4)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad directory inode %Lu, ptr 0x%p",
__func__, ip->i_ino, ip);
@@ -3518,8 +3518,7 @@ xfs_iflush_int(
}
}
if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
- ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
- XFS_RANDOM_IFLUSH_5)) {
+ ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: detected corrupt incore inode %Lu, "
"total extents = %d, nblocks = %Ld, ptr 0x%p",
@@ -3529,7 +3528,7 @@ xfs_iflush_int(
goto corrupt_out;
}
if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
- mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
+ mp, XFS_ERRTAG_IFLUSH_6)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 10e89fcb49d7..0ee453de239a 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -192,8 +192,8 @@ static inline void
xfs_set_projid(struct xfs_inode *ip,
prid_t projid)
{
- ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16);
- ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
+ ip->i_d.di_projid_hi = (uint16_t) (projid >> 16);
+ ip->i_d.di_projid_lo = (uint16_t) (projid & 0xffff);
}
static inline prid_t
@@ -445,9 +445,6 @@ int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
xfs_fsize_t isize, bool *did_zeroing);
int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count,
bool *did_zero);
-loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
- loff_t eof, int whence);
-
/* from xfs_iops.c */
extern void xfs_setup_inode(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 08cb7d1a4a3a..013cc78d7daf 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -834,9 +834,7 @@ xfs_inode_item_format_convert(
in_f->ilf_dsize = in_f32->ilf_dsize;
in_f->ilf_ino = in_f32->ilf_ino;
/* copy biggest field of ilf_u */
- memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
- in_f32->ilf_u.ilfu_uuid.__u_bits,
- sizeof(uuid_t));
+ uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid);
in_f->ilf_blkno = in_f32->ilf_blkno;
in_f->ilf_len = in_f32->ilf_len;
in_f->ilf_boffset = in_f32->ilf_boffset;
@@ -851,9 +849,7 @@ xfs_inode_item_format_convert(
in_f->ilf_dsize = in_f64->ilf_dsize;
in_f->ilf_ino = in_f64->ilf_ino;
/* copy biggest field of ilf_u */
- memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
- in_f64->ilf_u.ilfu_uuid.__u_bits,
- sizeof(uuid_t));
+ uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f64->ilf_u.ilfu_uuid);
in_f->ilf_blkno = in_f64->ilf_blkno;
in_f->ilf_len = in_f64->ilf_len;
in_f->ilf_boffset = in_f64->ilf_boffset;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6190697603c9..9c0c7a920304 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -120,8 +120,7 @@ xfs_find_handle(
handle.ha_fid.fid_pad = 0;
handle.ha_fid.fid_gen = inode->i_generation;
handle.ha_fid.fid_ino = ip->i_ino;
-
- hsize = XFS_HSIZE(handle);
+ hsize = sizeof(xfs_handle_t);
}
error = -EFAULT;
@@ -444,8 +443,8 @@ xfs_attrmulti_attr_get(
struct inode *inode,
unsigned char *name,
unsigned char __user *ubuf,
- __uint32_t *len,
- __uint32_t flags)
+ uint32_t *len,
+ uint32_t flags)
{
unsigned char *kbuf;
int error = -EFAULT;
@@ -473,8 +472,8 @@ xfs_attrmulti_attr_set(
struct inode *inode,
unsigned char *name,
const unsigned char __user *ubuf,
- __uint32_t len,
- __uint32_t flags)
+ uint32_t len,
+ uint32_t flags)
{
unsigned char *kbuf;
int error;
@@ -499,7 +498,7 @@ int
xfs_attrmulti_attr_remove(
struct inode *inode,
unsigned char *name,
- __uint32_t flags)
+ uint32_t flags)
{
int error;
@@ -877,7 +876,7 @@ xfs_merge_ioc_xflags(
STATIC unsigned int
xfs_di2lxflags(
- __uint16_t di_flags)
+ uint16_t di_flags)
{
unsigned int flags = 0;
@@ -1288,7 +1287,7 @@ xfs_ioctl_setattr_check_projid(
struct fsxattr *fa)
{
/* Disallow 32bit project ids if projid32bit feature is not enabled. */
- if (fa->fsx_projid > (__uint16_t)-1 &&
+ if (fa->fsx_projid > (uint16_t)-1 &&
!xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
return -EINVAL;
@@ -1932,7 +1931,7 @@ xfs_file_ioctl(
case XFS_IOC_SET_RESBLKS: {
xfs_fsop_resblks_t inout;
- __uint64_t in;
+ uint64_t in;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2018,12 +2017,12 @@ xfs_file_ioctl(
}
case XFS_IOC_GOINGDOWN: {
- __uint32_t in;
+ uint32_t in;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (get_user(in, (__uint32_t __user *)arg))
+ if (get_user(in, (uint32_t __user *)arg))
return -EFAULT;
return xfs_fs_goingdown(mp, in);
@@ -2038,14 +2037,14 @@ xfs_file_ioctl(
if (copy_from_user(&in, arg, sizeof(in)))
return -EFAULT;
- return xfs_errortag_add(in.errtag, mp);
+ return xfs_errortag_add(mp, in.errtag);
}
case XFS_IOC_ERROR_CLEARALL:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- return xfs_errortag_clearall(mp, 1);
+ return xfs_errortag_clearall(mp);
case XFS_IOC_FREE_EOFBLOCKS: {
struct xfs_fs_eofblocks eofb;
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 8b52881bfd90..e86c3ea137d2 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -48,22 +48,22 @@ xfs_attrmulti_attr_get(
struct inode *inode,
unsigned char *name,
unsigned char __user *ubuf,
- __uint32_t *len,
- __uint32_t flags);
+ uint32_t *len,
+ uint32_t flags);
extern int
xfs_attrmulti_attr_set(
struct inode *inode,
unsigned char *name,
const unsigned char __user *ubuf,
- __uint32_t len,
- __uint32_t flags);
+ uint32_t len,
+ uint32_t flags);
extern int
xfs_attrmulti_attr_remove(
struct inode *inode,
unsigned char *name,
- __uint32_t flags);
+ uint32_t flags);
extern struct dentry *
xfs_handle_to_dentry(
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index b1bb45444df8..5492bcf6f442 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -112,9 +112,9 @@ typedef struct compat_xfs_fsop_handlereq {
/* The bstat field in the swapext struct needs translation */
typedef struct compat_xfs_swapext {
- __int64_t sx_version; /* version */
- __int64_t sx_fdtarget; /* fd of target file */
- __int64_t sx_fdtmp; /* fd of tmp file */
+ int64_t sx_version; /* version */
+ int64_t sx_fdtarget; /* fd of target file */
+ int64_t sx_fdtmp; /* fd of tmp file */
xfs_off_t sx_offset; /* offset into file */
 xfs_off_t sx_length; /* length from offset */
char sx_pad[16]; /* pad space, unused */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index a63f61c256bd..813394c62849 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -543,7 +543,7 @@ xfs_file_iomap_begin_delay(
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ mp, XFS_ERRTAG_BMAPIFORMAT))) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
error = -EFSCORRUPTED;
goto out_unlock;
@@ -995,6 +995,11 @@ xfs_file_iomap_begin(
lockmode = xfs_ilock_data_map_shared(ip);
}
+ if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+ error = -EAGAIN;
+ goto out_unlock;
+ }
+
ASSERT(offset <= mp->m_super->s_maxbytes);
if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
length = mp->m_super->s_maxbytes - offset;
@@ -1016,6 +1021,15 @@ xfs_file_iomap_begin(
if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
if (flags & IOMAP_DIRECT) {
+ /*
+ * A reflinked inode will result in CoW alloc.
+ * FIXME: It could still overwrite on unshared extents
+ * and not need allocation.
+ */
+ if (flags & IOMAP_NOWAIT) {
+ error = -EAGAIN;
+ goto out_unlock;
+ }
/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, &imap, &shared,
&lockmode);
@@ -1033,6 +1047,14 @@ xfs_file_iomap_begin(
if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
/*
+ * If nowait is set bail since we are going to make
+ * allocations.
+ */
+ if (flags & IOMAP_NOWAIT) {
+ error = -EAGAIN;
+ goto out_unlock;
+ }
+ /*
* We cap the maximum length we map here to MAX_WRITEBACK_PAGES
* pages to keep the chunks of work done somewhat symmetric
* with the work writeback does. This is a completely arbitrary
@@ -1068,7 +1090,7 @@ xfs_file_iomap_begin(
/* optionally associate a dax device with the iomap bdev */
bdev = iomap->bdev;
if (blk_queue_dax(bdev->bd_queue))
- iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
else
iomap->dax_dev = NULL;
@@ -1097,7 +1119,7 @@ xfs_file_iomap_end_delalloc(
* Behave as if the write failed if drop writes is enabled. Set the NEW
* flag to force delalloc cleanup.
*/
- if (xfs_mp_drop_writes(mp)) {
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) {
iomap->flags |= IOMAP_F_NEW;
written = 0;
}
@@ -1149,7 +1171,7 @@ xfs_file_iomap_end(
unsigned flags,
struct iomap *iomap)
{
- put_dax(iomap->dax_dev);
+ fs_put_dax(iomap->dax_dev);
if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
length, written, iomap);
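
The IOMAP_NOWAIT bail-outs above are what a RWF_NOWAIT direct write ultimately
sees as -EAGAIN. A minimal userspace sketch, assuming a kernel and libc that
expose pwritev2() and RWF_NOWAIT (the file name and buffer sizes are
arbitrary):

	#define _GNU_SOURCE
	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/uio.h>

	int main(int argc, char **argv)
	{
		struct iovec iov;
		void *buf;
		ssize_t ret;
		int fd;

		if (argc < 2) {
			fprintf(stderr, "usage: %s <file>\n", argv[0]);
			return 1;
		}
		/* O_DIRECT wants an aligned buffer. */
		if (posix_memalign(&buf, 4096, 4096))
			return 1;
		memset(buf, 'x', 4096);
		iov.iov_base = buf;
		iov.iov_len = 4096;

		fd = open(argv[1], O_WRONLY | O_DIRECT);
		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* RWF_NOWAIT: fail with EAGAIN instead of blocking. */
		ret = pwritev2(fd, &iov, 1, 0, RWF_NOWAIT);
		if (ret < 0 && errno == EAGAIN)
			puts("write would block (allocation or CoW needed)");
		else if (ret < 0)
			perror("pwritev2");
		return 0;
	}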
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ebfc13350f9a..469c9fa4c178 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -190,12 +190,12 @@ xfs_generic_create(
#ifdef CONFIG_XFS_POSIX_ACL
if (default_acl) {
- error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
goto out_cleanup_inode;
}
if (acl) {
- error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ error = __xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
if (error)
goto out_cleanup_inode;
}
@@ -460,7 +460,7 @@ xfs_vn_get_link(
if (!dentry)
return ERR_PTR(-ECHILD);
- link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
+ link = kmalloc(XFS_SYMLINK_MAXLEN+1, GFP_KERNEL);
if (!link)
goto out_err;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 26d67ce3c18d..c393a2f6d8c3 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -31,7 +31,7 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
-STATIC int
+int
xfs_internal_inum(
xfs_mount_t *mp,
xfs_ino_t ino)
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 6ea8b3912fa4..17e86e0541af 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -96,4 +96,6 @@ xfs_inumbers(
void __user *buffer, /* buffer with inode info */
inumbers_fmt_pf formatter);
+int xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
+
#endif /* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 044fb0e15390..9301c5a6060b 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -19,18 +19,11 @@
#define __XFS_LINUX__
#include <linux/types.h>
+#include <linux/uuid.h>
/*
* Kernel specific type declarations for XFS
*/
-typedef signed char __int8_t;
-typedef unsigned char __uint8_t;
-typedef signed short int __int16_t;
-typedef unsigned short int __uint16_t;
-typedef signed int __int32_t;
-typedef unsigned int __uint32_t;
-typedef signed long long int __int64_t;
-typedef unsigned long long int __uint64_t;
typedef __s64 xfs_off_t; /* <file offset> type */
typedef unsigned long long xfs_ino_t; /* <inode> type */
@@ -42,7 +35,6 @@ typedef __u32 xfs_nlink_t;
#include "kmem.h"
#include "mrlock.h"
-#include "uuid.h"
#include <linux/semaphore.h>
#include <linux/mm.h>
@@ -151,7 +143,6 @@ typedef __u32 xfs_nlink_t;
#define __return_address __builtin_return_address(0)
#define XFS_PROJID_DEFAULT 0
-#define MAXPATHLEN 1024
#define MIN(a,b) (min(a,b))
#define MAX(a,b) (max(a,b))
@@ -186,22 +177,22 @@ extern struct xstats xfsstats;
* are converting to the init_user_ns. The uid is later mapped to a particular
* user namespace value when crossing the kernel/user boundary.
*/
-static inline __uint32_t xfs_kuid_to_uid(kuid_t uid)
+static inline uint32_t xfs_kuid_to_uid(kuid_t uid)
{
return from_kuid(&init_user_ns, uid);
}
-static inline kuid_t xfs_uid_to_kuid(__uint32_t uid)
+static inline kuid_t xfs_uid_to_kuid(uint32_t uid)
{
return make_kuid(&init_user_ns, uid);
}
-static inline __uint32_t xfs_kgid_to_gid(kgid_t gid)
+static inline uint32_t xfs_kgid_to_gid(kgid_t gid)
{
return from_kgid(&init_user_ns, gid);
}
-static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
+static inline kgid_t xfs_gid_to_kgid(uint32_t gid)
{
return make_kgid(&init_user_ns, gid);
}
@@ -231,14 +222,14 @@ static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a))
-static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
+static inline uint64_t roundup_64(uint64_t x, uint32_t y)
{
x += y - 1;
do_div(x, y);
return x * y;
}
-static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
+static inline uint64_t howmany_64(uint64_t x, uint32_t y)
{
x += y - 1;
do_div(x, y);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3731f13f63e9..0053bcf2b10a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -434,7 +434,7 @@ xfs_log_reserve(
int unit_bytes,
int cnt,
struct xlog_ticket **ticp,
- __uint8_t client,
+ uint8_t client,
bool permanent)
{
struct xlog *log = mp->m_log;
@@ -825,9 +825,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
if (!error) {
/* the data section must be 32 bit size aligned */
struct {
- __uint16_t magic;
- __uint16_t pad1;
- __uint32_t pad2; /* may as well make it 64 bits */
+ uint16_t magic;
+ uint16_t pad1;
+ uint32_t pad2; /* may as well make it 64 bits */
} magic = {
.magic = XLOG_UNMOUNT_TYPE,
};
@@ -1189,8 +1189,7 @@ xlog_iodone(xfs_buf_t *bp)
* IOABORT state. The IOABORT state is only set in DEBUG mode to inject
* CRC errors into log recovery.
*/
- if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR,
- XFS_RANDOM_IODONE_IOERR) ||
+ if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR) ||
iclog->ic_state & XLOG_STATE_IOABORT) {
if (iclog->ic_state & XLOG_STATE_IOABORT)
iclog->ic_state &= ~XLOG_STATE_IOABORT;
@@ -1665,7 +1664,7 @@ xlog_cksum(
char *dp,
int size)
{
- __uint32_t crc;
+ uint32_t crc;
/* first generate the crc for the record header ... */
crc = xfs_start_cksum_update((char *)rhead,
@@ -1828,7 +1827,7 @@ xlog_sync(
*/
dptr = (char *)&iclog->ic_header + count;
for (i = 0; i < split; i += BBSIZE) {
- __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
+ uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
if (++cycle == XLOG_HEADER_MAGIC_NUM)
cycle++;
*(__be32 *)dptr = cpu_to_be32(cycle);
@@ -1842,7 +1841,6 @@ xlog_sync(
/* calculate the checksum */
iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
iclog->ic_datap, size);
-#ifdef DEBUG
/*
* Intentionally corrupt the log record CRC based on the error injection
* frequency, if defined. This facilitates testing log recovery in the
@@ -1850,15 +1848,13 @@ xlog_sync(
* write on I/O completion and shut down the fs. The subsequent mount
* detects the bad CRC and attempts to recover.
*/
- if (log->l_badcrc_factor &&
- (prandom_u32() % log->l_badcrc_factor == 0)) {
+ if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
iclog->ic_state |= XLOG_STATE_IOABORT;
xfs_warn(log->l_mp,
"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
be64_to_cpu(iclog->ic_header.h_lsn));
}
-#endif
bp->b_io_length = BTOBB(count);
bp->b_fspriv = iclog;
@@ -2024,7 +2020,7 @@ xlog_print_tic_res(
};
#undef REG_TYPE_STR
- xfs_warn(mp, "xlog_write: reservation summary:");
+ xfs_warn(mp, "ticket reservation summary:");
xfs_warn(mp, " unit res = %d bytes",
ticket->t_unit_res);
xfs_warn(mp, " current res = %d bytes",
@@ -2045,10 +2041,55 @@ xlog_print_tic_res(
"bad-rtype" : res_type_str[r_type]),
ticket->t_res_arr[i].r_len);
}
+}
+
+/*
+ * Print a summary of the transaction.
+ */
+void
+xlog_print_trans(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_log_item_desc *lidp;
+
+ /* dump core transaction and ticket info */
+ xfs_warn(mp, "transaction summary:");
+ xfs_warn(mp, " flags = 0x%x", tp->t_flags);
+
+ xlog_print_tic_res(mp, tp->t_ticket);
- xfs_alert_tag(mp, XFS_PTAG_LOGRES,
- "xlog_write: reservation ran out. Need to up reservation");
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ /* dump each log item */
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+ struct xfs_log_item *lip = lidp->lid_item;
+ struct xfs_log_vec *lv = lip->li_lv;
+ struct xfs_log_iovec *vec;
+ int i;
+
+ xfs_warn(mp, "log item: ");
+ xfs_warn(mp, " type = 0x%x", lip->li_type);
+ xfs_warn(mp, " flags = 0x%x", lip->li_flags);
+ if (!lv)
+ continue;
+ xfs_warn(mp, " niovecs = %d", lv->lv_niovecs);
+ xfs_warn(mp, " size = %d", lv->lv_size);
+ xfs_warn(mp, " bytes = %d", lv->lv_bytes);
+ xfs_warn(mp, " buf len = %d", lv->lv_buf_len);
+
+ /* dump each iovec for the log item */
+ vec = lv->lv_iovecp;
+ for (i = 0; i < lv->lv_niovecs; i++) {
+ int dumplen = min(vec->i_len, 32);
+
+ xfs_warn(mp, " iovec[%d]", i);
+ xfs_warn(mp, " type = 0x%x", vec->i_type);
+ xfs_warn(mp, " len = %d", vec->i_len);
+ xfs_warn(mp, " first %d bytes of iovec[%d]:", dumplen, i);
+ xfs_hex_dump(vec->i_addr, dumplen);
+
+ vec++;
+ }
+ }
}
/*
@@ -2321,8 +2362,12 @@ xlog_write(
if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
ticket->t_curr_res -= sizeof(xlog_op_header_t);
- if (ticket->t_curr_res < 0)
+ if (ticket->t_curr_res < 0) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
+ "ctx ticket reservation ran out. Need to up reservation");
xlog_print_tic_res(log->l_mp, ticket);
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ }
index = 0;
lv = log_vector;
@@ -2363,8 +2408,8 @@ xlog_write(
}
reg = &vecp[index];
- ASSERT(reg->i_len % sizeof(__int32_t) == 0);
- ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
+ ASSERT(reg->i_len % sizeof(int32_t) == 0);
+ ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
start_rec_copy = xlog_write_start_rec(ptr, ticket);
if (start_rec_copy) {
@@ -3143,7 +3188,7 @@ xlog_state_switch_iclogs(
/* Round up to next log-sunit */
if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
log->l_mp->m_sb.sb_logsunit > 1) {
- __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
+ uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
}
@@ -3771,7 +3816,7 @@ xlog_verify_iclog(
xlog_in_core_2_t *xhdr;
void *base_ptr, *ptr, *p;
ptrdiff_t field_offset;
- __uint8_t clientid;
+ uint8_t clientid;
int len, i, j, k, op_len;
int idx;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index cc5a9f1574e7..bf212772595c 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -159,7 +159,7 @@ int xfs_log_reserve(struct xfs_mount *mp,
int length,
int count,
struct xlog_ticket **ticket,
- __uint8_t clientid,
+ uint8_t clientid,
bool permanent);
int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
void xfs_log_unmount(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 82f1cbcc4de1..fbe72b134bef 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -410,6 +410,7 @@ xlog_cil_insert_items(
int len = 0;
int diff_iovecs = 0;
int iclog_space;
+ int iovhdr_res = 0, split_res = 0, ctx_res = 0;
ASSERT(tp);
@@ -419,30 +420,11 @@ xlog_cil_insert_items(
*/
xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
- /*
- * Now (re-)position everything modified at the tail of the CIL.
- * We do this here so we only need to take the CIL lock once during
- * the transaction commit.
- */
spin_lock(&cil->xc_cil_lock);
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
-
- /* Skip items which aren't dirty in this transaction. */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
- continue;
-
- /*
- * Only move the item if it isn't already at the tail. This is
- * to prevent a transient list_empty() state when reinserting
- * an item that is already the only item in the CIL.
- */
- if (!list_is_last(&lip->li_cil, &cil->xc_cil))
- list_move_tail(&lip->li_cil, &cil->xc_cil);
- }
/* account for space used by new iovec headers */
- len += diff_iovecs * sizeof(xlog_op_header_t);
+ iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t);
+ len += iovhdr_res;
ctx->nvecs += diff_iovecs;
/* attach the transaction to the CIL if it has any busy extents */
@@ -457,28 +439,66 @@ xlog_cil_insert_items(
* during the transaction commit.
*/
if (ctx->ticket->t_curr_res == 0) {
- ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
- tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res;
+ ctx_res = ctx->ticket->t_unit_res;
+ ctx->ticket->t_curr_res = ctx_res;
+ tp->t_ticket->t_curr_res -= ctx_res;
}
/* do we need space for more log record headers? */
iclog_space = log->l_iclog_size - log->l_iclog_hsize;
if (len > 0 && (ctx->space_used / iclog_space !=
(ctx->space_used + len) / iclog_space)) {
- int hdrs;
-
- hdrs = (len + iclog_space - 1) / iclog_space;
+ split_res = (len + iclog_space - 1) / iclog_space;
/* need to take into account split region headers, too */
- hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
- ctx->ticket->t_unit_res += hdrs;
- ctx->ticket->t_curr_res += hdrs;
- tp->t_ticket->t_curr_res -= hdrs;
+ split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+ ctx->ticket->t_unit_res += split_res;
+ ctx->ticket->t_curr_res += split_res;
+ tp->t_ticket->t_curr_res -= split_res;
ASSERT(tp->t_ticket->t_curr_res >= len);
}
tp->t_ticket->t_curr_res -= len;
ctx->space_used += len;
+ /*
+ * If we've overrun the reservation, dump the tx details before we move
+ * the log items. Shutdown is imminent...
+ */
+ if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
+ xfs_warn(log->l_mp, "Transaction log reservation overrun:");
+ xfs_warn(log->l_mp,
+ " log items: %d bytes (iov hdrs: %d bytes)",
+ len, iovhdr_res);
+ xfs_warn(log->l_mp, " split region headers: %d bytes",
+ split_res);
+ xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res);
+ xlog_print_trans(tp);
+ }
+
+ /*
+ * Now (re-)position everything modified at the tail of the CIL.
+ * We do this here so we only need to take the CIL lock once during
+ * the transaction commit.
+ */
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+ struct xfs_log_item *lip = lidp->lid_item;
+
+ /* Skip items which aren't dirty in this transaction. */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ continue;
+
+ /*
+ * Only move the item if it isn't already at the tail. This is
+ * to prevent a transient list_empty() state when reinserting
+ * an item that is already the only item in the CIL.
+ */
+ if (!list_is_last(&lip->li_cil, &cil->xc_cil))
+ list_move_tail(&lip->li_cil, &cil->xc_cil);
+ }
+
spin_unlock(&cil->xc_cil_lock);
+
+ if (tp->t_ticket->t_curr_res < 0)
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}
static void
@@ -973,6 +993,7 @@ xfs_log_commit_cil(
{
struct xlog *log = mp->m_log;
struct xfs_cil *cil = log->l_cilp;
+ xfs_lsn_t xc_commit_lsn;
/*
* Do all necessary memory allocation before we lock the CIL.
@@ -986,13 +1007,9 @@ xfs_log_commit_cil(
xlog_cil_insert_items(log, tp);
- /* check we didn't blow the reservation */
- if (tp->t_ticket->t_curr_res < 0)
- xlog_print_tic_res(mp, tp->t_ticket);
-
- tp->t_commit_lsn = cil->xc_ctx->sequence;
+ xc_commit_lsn = cil->xc_ctx->sequence;
if (commit_lsn)
- *commit_lsn = tp->t_commit_lsn;
+ *commit_lsn = xc_commit_lsn;
xfs_log_done(mp, tp->t_ticket, NULL, regrant);
xfs_trans_unreserve_and_mod_sb(tp);
@@ -1008,7 +1025,7 @@ xfs_log_commit_cil(
* the log items. This affects (at least) processing of stale buffers,
* inodes and EFIs.
*/
- xfs_trans_free_items(tp, tp->t_commit_lsn, false);
+ xfs_trans_free_items(tp, xc_commit_lsn, false);
xlog_cil_push_background(log);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index c2604a5366f2..51bf7b827387 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -419,7 +419,7 @@ struct xlog {
};
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
- ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
+ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
@@ -456,6 +456,7 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
}
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
+void xlog_print_trans(struct xfs_trans *);
int
xlog_write(
struct xlog *log,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 4a98762ec8b4..9549188f5a36 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -352,13 +352,13 @@ xlog_header_check_mount(
{
ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
- if (uuid_is_nil(&head->h_fs_uuid)) {
+ if (uuid_is_null(&head->h_fs_uuid)) {
/*
* IRIX doesn't write the h_fs_uuid or h_fmt fields. If
- * h_fs_uuid is nil, we assume this log was last mounted
+ * h_fs_uuid is null, we assume this log was last mounted
* by IRIX and continue.
*/
- xfs_warn(mp, "nil uuid in log - IRIX style log");
+ xfs_warn(mp, "null uuid in log - IRIX style log");
} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
xfs_warn(mp, "log has mismatched uuid - can't recover");
xlog_header_check_dump(mp, head);
@@ -2230,9 +2230,9 @@ xlog_recover_get_buf_lsn(
struct xfs_mount *mp,
struct xfs_buf *bp)
{
- __uint32_t magic32;
- __uint16_t magic16;
- __uint16_t magicda;
+ uint32_t magic32;
+ uint16_t magic16;
+ uint16_t magicda;
void *blk = bp->b_addr;
uuid_t *uuid;
xfs_lsn_t lsn = -1;
@@ -2381,9 +2381,9 @@ xlog_recover_validate_buf_type(
xfs_lsn_t current_lsn)
{
struct xfs_da_blkinfo *info = bp->b_addr;
- __uint32_t magic32;
- __uint16_t magic16;
- __uint16_t magicda;
+ uint32_t magic32;
+ uint16_t magic16;
+ uint16_t magicda;
char *warnmsg = NULL;
/*
@@ -2852,7 +2852,7 @@ xlog_recover_buffer_pass2(
if (XFS_DINODE_MAGIC ==
be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
(BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
- (__uint32_t)log->l_mp->m_inode_cluster_size))) {
+ (uint32_t)log->l_mp->m_inode_cluster_size))) {
xfs_buf_stale(bp);
error = xfs_bwrite(bp);
} else {
@@ -3423,7 +3423,7 @@ xlog_recover_efd_pass2(
xfs_efd_log_format_t *efd_formatp;
xfs_efi_log_item_t *efip = NULL;
xfs_log_item_t *lip;
- __uint64_t efi_id;
+ uint64_t efi_id;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp = log->l_ailp;
@@ -3519,7 +3519,7 @@ xlog_recover_rud_pass2(
struct xfs_rud_log_format *rud_formatp;
struct xfs_rui_log_item *ruip = NULL;
struct xfs_log_item *lip;
- __uint64_t rui_id;
+ uint64_t rui_id;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp = log->l_ailp;
@@ -3635,7 +3635,7 @@ xlog_recover_cud_pass2(
struct xfs_cud_log_format *cud_formatp;
struct xfs_cui_log_item *cuip = NULL;
struct xfs_log_item *lip;
- __uint64_t cui_id;
+ uint64_t cui_id;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp = log->l_ailp;
@@ -3754,7 +3754,7 @@ xlog_recover_bud_pass2(
struct xfs_bud_log_format *bud_formatp;
struct xfs_bui_log_item *buip = NULL;
struct xfs_log_item *lip;
- __uint64_t bui_id;
+ uint64_t bui_id;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp = log->l_ailp;
@@ -3796,7 +3796,7 @@ xlog_recover_bud_pass2(
* This routine is called when an inode create format structure is found in a
* committed transaction in the log. Its purpose is to initialise the inodes
* being allocated on disk. This requires us to get inode cluster buffers that
- * match the range to be intialised, stamped with inode templates and written
+ * match the range to be initialised, stamped with inode templates and written
* by delayed write so that subsequent modifications will hit the cached buffer
* and only need writing out at the end of recovery.
*/
@@ -4152,7 +4152,7 @@ xlog_recover_commit_trans(
#define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
- hlist_del(&trans->r_list);
+ hlist_del_init(&trans->r_list);
error = xlog_recover_reorder_trans(log, trans, pass);
if (error)
@@ -4354,6 +4354,8 @@ xlog_recover_free_trans(
xlog_recover_item_t *item, *n;
int i;
+ hlist_del_init(&trans->r_list);
+
list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
/* Free the regions in the item. */
list_del(&item->ri_list);
@@ -5224,12 +5226,16 @@ xlog_do_recovery_pass(
int error2 = 0;
int bblks, split_bblks;
int hblks, split_hblks, wrapped_hblks;
+ int i;
struct hlist_head rhash[XLOG_RHASH_SIZE];
LIST_HEAD (buffer_list);
ASSERT(head_blk != tail_blk);
rhead_blk = 0;
+ for (i = 0; i < XLOG_RHASH_SIZE; i++)
+ INIT_HLIST_HEAD(&rhash[i]);
+
/*
* Read the header of the tail block and get the iclog buffer size from
* h_size. Use this to tell how many sectors make up the log header.
@@ -5466,6 +5472,19 @@ xlog_do_recovery_pass(
if (error && first_bad)
*first_bad = rhead_blk;
+ /*
+ * Transactions are freed at commit time but transactions without commit
+ * records on disk are never committed. Free any that may be left in the
+ * hash table.
+ */
+ for (i = 0; i < XLOG_RHASH_SIZE; i++) {
+ struct hlist_node *tmp;
+ struct xlog_recover *trans;
+
+ hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
+ xlog_recover_free_trans(trans);
+ }
+
return error ? error : error2;
}
@@ -5772,9 +5791,9 @@ xlog_recover_check_summary(
xfs_buf_t *agfbp;
xfs_buf_t *agibp;
xfs_agnumber_t agno;
- __uint64_t freeblks;
- __uint64_t itotal;
- __uint64_t ifree;
+ uint64_t freeblks;
+ uint64_t itotal;
+ uint64_t ifree;
int error;
mp = log->l_mp;
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 11792d888e4e..e68bd1050eab 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -110,7 +110,10 @@ assfail(char *expr, char *file, int line)
{
xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
expr, file, line);
- BUG();
+ if (xfs_globals.bug_on_assert)
+ BUG();
+ else
+ WARN_ON(1);
}
void
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 2eaf81859166..40d4e8b4e193 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -74,20 +74,19 @@ xfs_uuid_mount(
int hole, i;
/* Publish UUID in struct super_block */
- BUILD_BUG_ON(sizeof(mp->m_super->s_uuid) != sizeof(uuid_t));
- memcpy(&mp->m_super->s_uuid, uuid, sizeof(uuid_t));
+ uuid_copy(&mp->m_super->s_uuid, uuid);
if (mp->m_flags & XFS_MOUNT_NOUUID)
return 0;
- if (uuid_is_nil(uuid)) {
- xfs_warn(mp, "Filesystem has nil UUID - can't mount");
+ if (uuid_is_null(uuid)) {
+ xfs_warn(mp, "Filesystem has null UUID - can't mount");
return -EINVAL;
}
mutex_lock(&xfs_uuid_table_mutex);
for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
- if (uuid_is_nil(&xfs_uuid_table[i])) {
+ if (uuid_is_null(&xfs_uuid_table[i])) {
hole = i;
continue;
}
@@ -124,7 +123,7 @@ xfs_uuid_unmount(
mutex_lock(&xfs_uuid_table_mutex);
for (i = 0; i < xfs_uuid_table_size; i++) {
- if (uuid_is_nil(&xfs_uuid_table[i]))
+ if (uuid_is_null(&xfs_uuid_table[i]))
continue;
if (!uuid_equal(uuid, &xfs_uuid_table[i]))
continue;
@@ -174,7 +173,7 @@ xfs_free_perag(
int
xfs_sb_validate_fsb_count(
xfs_sb_t *sbp,
- __uint64_t nblocks)
+ uint64_t nblocks)
{
ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
ASSERT(sbp->sb_blocklog >= BBSHIFT);
@@ -436,7 +435,7 @@ STATIC void
xfs_set_maxicount(xfs_mount_t *mp)
{
xfs_sb_t *sbp = &(mp->m_sb);
- __uint64_t icount;
+ uint64_t icount;
if (sbp->sb_imax_pct) {
/*
@@ -502,7 +501,7 @@ xfs_set_low_space_thresholds(
int i;
for (i = 0; i < XFS_LOWSP_MAX; i++) {
- __uint64_t space = mp->m_sb.sb_dblocks;
+ uint64_t space = mp->m_sb.sb_dblocks;
do_div(space, 100);
mp->m_low_space[i] = space * (i + 1);
@@ -598,10 +597,10 @@ xfs_mount_reset_sbqflags(
return xfs_sync_sb(mp, false);
}
-__uint64_t
+uint64_t
xfs_default_resblks(xfs_mount_t *mp)
{
- __uint64_t resblks;
+ uint64_t resblks;
/*
* We default to 5% or 8192 fsbs of space reserved, whichever is
@@ -612,7 +611,7 @@ xfs_default_resblks(xfs_mount_t *mp)
*/
resblks = mp->m_sb.sb_dblocks;
do_div(resblks, 20);
- resblks = min_t(__uint64_t, resblks, 8192);
+ resblks = min_t(uint64_t, resblks, 8192);
return resblks;
}
@@ -632,7 +631,7 @@ xfs_mountfs(
{
struct xfs_sb *sbp = &(mp->m_sb);
struct xfs_inode *rip;
- __uint64_t resblks;
+ uint64_t resblks;
uint quotamount = 0;
uint quotaflags = 0;
int error = 0;
@@ -720,10 +719,13 @@ xfs_mountfs(
if (error)
goto out_del_stats;
+ error = xfs_errortag_init(mp);
+ if (error)
+ goto out_remove_error_sysfs;
error = xfs_uuid_mount(mp);
if (error)
- goto out_remove_error_sysfs;
+ goto out_remove_errortag;
/*
* Set the minimum read and write sizes
@@ -793,7 +795,10 @@ xfs_mountfs(
* Copies the low order bits of the timestamp and the randomly
* set "sequence" number out of a UUID.
*/
- uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);
+ mp->m_fixedfsid[0] =
+ (get_unaligned_be16(&sbp->sb_uuid.b[8]) << 16) |
+ get_unaligned_be16(&sbp->sb_uuid.b[4]);
+ mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]);
mp->m_dmevmask = 0; /* not persistent; set after each mount */
@@ -1042,6 +1047,8 @@ xfs_mountfs(
xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
+ out_remove_errortag:
+ xfs_errortag_del(mp);
out_remove_error_sysfs:
xfs_error_sysfs_del(mp);
out_del_stats:
@@ -1060,7 +1067,7 @@ void
xfs_unmountfs(
struct xfs_mount *mp)
{
- __uint64_t resblks;
+ uint64_t resblks;
int error;
cancel_delayed_work_sync(&mp->m_eofblocks_work);
@@ -1145,10 +1152,11 @@ xfs_unmountfs(
xfs_uuid_unmount(mp);
#if defined(DEBUG)
- xfs_errortag_clearall(mp, 0);
+ xfs_errortag_clearall(mp);
#endif
xfs_free_perag(mp);
+ xfs_errortag_del(mp);
xfs_error_sysfs_del(mp);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
@@ -1209,7 +1217,7 @@ xfs_mod_icount(
struct xfs_mount *mp,
int64_t delta)
{
- __percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
+ percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) {
ASSERT(0);
percpu_counter_add(&mp->m_icount, -delta);
@@ -1288,7 +1296,7 @@ xfs_mod_fdblocks(
else
batch = XFS_FDBLOCKS_BATCH;
- __percpu_counter_add(&mp->m_fdblocks, delta, batch);
+ percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9fa312a41c93..e0792d036be2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -108,10 +108,10 @@ typedef struct xfs_mount {
xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
- __uint8_t m_blkbit_log; /* blocklog + NBBY */
- __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
- __uint8_t m_agno_log; /* log #ag's */
- __uint8_t m_agino_log; /* #bits for agino in inum */
+ uint8_t m_blkbit_log; /* blocklog + NBBY */
+ uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
+ uint8_t m_agno_log; /* log #ag's */
+ uint8_t m_agino_log; /* #bits for agino in inum */
uint m_inode_cluster_size;/* min inode buf size */
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
@@ -139,7 +139,7 @@ typedef struct xfs_mount {
struct mutex m_growlock; /* growfs mutex */
int m_fixedfsid[2]; /* unchanged for life of FS */
uint m_dmevmask; /* DMI events for this FS */
- __uint64_t m_flags; /* global mount flags */
+ uint64_t m_flags; /* global mount flags */
bool m_inotbt_nores; /* no per-AG finobt resv. */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
@@ -148,14 +148,14 @@ typedef struct xfs_mount {
int m_inoalign_mask;/* mask sb_inoalignmt if used */
uint m_qflags; /* quota status flags */
struct xfs_trans_resv m_resv; /* precomputed res values */
- __uint64_t m_maxicount; /* maximum inode count */
- __uint64_t m_resblks; /* total reserved blocks */
- __uint64_t m_resblks_avail;/* available reserved blocks */
- __uint64_t m_resblks_save; /* reserved blks @ remount,ro */
+ uint64_t m_maxicount; /* maximum inode count */
+ uint64_t m_resblks; /* total reserved blocks */
+ uint64_t m_resblks_avail;/* available reserved blocks */
+ uint64_t m_resblks_save; /* reserved blks @ remount,ro */
int m_dalign; /* stripe unit */
int m_swidth; /* stripe width */
int m_sinoalign; /* stripe unit inode alignment */
- __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
+ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
@@ -194,19 +194,17 @@ typedef struct xfs_mount {
* ever support shrinks it would have to be persisted in addition
* to various other kinds of pain inflicted on the pNFS server.
*/
- __uint32_t m_generation;
+ uint32_t m_generation;
bool m_fail_unmount;
#ifdef DEBUG
/*
- * DEBUG mode instrumentation to test and/or trigger delayed allocation
- * block killing in the event of failed writes. When enabled, all
- * buffered writes are silenty dropped and handled as if they failed.
- * All delalloc blocks in the range of the write (including pre-existing
- * delalloc blocks!) are tossed as part of the write failure error
- * handling sequence.
+ * Frequency with which errors are injected. Replaces xfs_etest; the
+ * value stored in here is the inverse of the frequency with which the
+ * error triggers. 1 = always, 2 = half the time, etc.
*/
- bool m_drop_writes;
+ unsigned int *m_errortag;
+ struct xfs_kobj m_errortag_kobj;
#endif
} xfs_mount_t;
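
Given the comment above, the runtime check reduces to a modular sample of the
per-mount frequency array. A hedged sketch of the idea (the real helper also
validates the tag index and warns when it fires):

	/*
	 * Illustrative only: fire roughly once per 'freq' calls.
	 * freq == 0 means the tag is disabled; freq == 1 fires always.
	 */
	static inline bool
	xfs_errortag_sample(
		struct xfs_mount	*mp,
		unsigned int		tag)
	{
		unsigned int		freq;

		if (!mp->m_errortag)
			return false;
		freq = mp->m_errortag[tag];
		if (!freq)
			return false;
		return prandom_u32() % freq == 0;
	}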
@@ -325,20 +323,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
-#ifdef DEBUG
-static inline bool
-xfs_mp_drop_writes(struct xfs_mount *mp)
-{
- return mp->m_drop_writes;
-}
-#else
-static inline bool
-xfs_mp_drop_writes(struct xfs_mount *mp)
-{
- return 0;
-}
-#endif
-
/* per-AG block reservation data structures*/
enum xfs_ag_resv_type {
XFS_AG_RESV_NONE = 0,
@@ -367,12 +351,12 @@ typedef struct xfs_perag {
char pagi_init; /* this agi's entry is initialized */
char pagf_metadata; /* the agf is preferred to be metadata */
char pagi_inodeok; /* The agi is ok for inodes */
- __uint8_t pagf_levels[XFS_BTNUM_AGF];
+ uint8_t pagf_levels[XFS_BTNUM_AGF];
/* # of levels in bno & cnt btree */
- __uint32_t pagf_flcount; /* count of blocks in freelist */
+ uint32_t pagf_flcount; /* count of blocks in freelist */
xfs_extlen_t pagf_freeblks; /* total free blocks */
xfs_extlen_t pagf_longest; /* longest free space */
- __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
+ uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
xfs_agino_t pagi_freecount; /* number of free inodes */
xfs_agino_t pagi_count; /* number of allocated inodes */
@@ -411,7 +395,7 @@ typedef struct xfs_perag {
struct xfs_ag_resv pag_agfl_resv;
/* reference count */
- __uint8_t pagf_refcount_level;
+ uint8_t pagf_refcount_level;
} xfs_perag_t;
static inline struct xfs_ag_resv *
@@ -434,7 +418,7 @@ void xfs_buf_hash_destroy(xfs_perag_t *pag);
extern void xfs_uuid_table_free(void);
extern int xfs_log_sbcount(xfs_mount_t *);
-extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
+extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
xfs_agnumber_t *maxagi);
@@ -450,7 +434,7 @@ extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
extern bool xfs_fs_writable(struct xfs_mount *mp, int level);
-extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
+extern int xfs_sb_validate_fsb_count(struct xfs_sb *, uint64_t);
extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5fe6e70b88ef..6ce948c436d5 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1247,6 +1247,7 @@ xfs_qm_flush_one(
struct xfs_dquot *dqp,
void *data)
{
+ struct xfs_mount *mp = dqp->q_mount;
struct list_head *buffer_list = data;
struct xfs_buf *bp = NULL;
int error = 0;
@@ -1257,7 +1258,32 @@ xfs_qm_flush_one(
if (!XFS_DQ_IS_DIRTY(dqp))
goto out_unlock;
- xfs_dqflock(dqp);
+ /*
+ * The only way the dquot is already flush locked by the time quotacheck
+ * gets here is if reclaim flushed it before the dqadjust walk dirtied
+ * it for the final time. Quotacheck collects all dquot bufs in the
+ * local delwri queue before dquots are dirtied, so reclaim can't possibly
+ * have queued it for I/O. The only way out is to push the buffer to
+ * cycle the flush lock.
+ */
+ if (!xfs_dqflock_nowait(dqp)) {
+ /* buf is pinned in-core by delwri list */
+ DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen);
+ bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL);
+ if (!bp) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+ xfs_buf_unlock(bp);
+
+ xfs_buf_delwri_pushbuf(bp, buffer_list);
+ xfs_buf_rele(bp);
+
+ error = -EAGAIN;
+ goto out_unlock;
+ }
+
error = xfs_qm_dqflush(dqp, &bp);
if (error)
goto out_unlock;
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 3e52d5de7ae1..2be6d2735ca9 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -33,7 +33,7 @@ xfs_fill_statvfs_from_dquot(
struct kstatfs *statp,
struct xfs_dquot *dqp)
{
- __uint64_t limit;
+ uint64_t limit;
limit = dqp->q_core.d_blk_softlimit ?
be64_to_cpu(dqp->q_core.d_blk_softlimit) :
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index f82d79a8c694..de9493253edf 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -269,7 +269,6 @@ xfs_fs_get_nextdqblk(
/* ID may be different, so convert back what we got */
*qid = make_kqid(current_user_ns(), qid->type, id);
return 0;
-
}
STATIC int
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index ffe6fe7a7eb5..ab2270a87196 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -155,6 +155,7 @@
int
xfs_reflink_find_shared(
struct xfs_mount *mp,
+ struct xfs_trans *tp,
xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_extlen_t aglen,
@@ -166,18 +167,18 @@ xfs_reflink_find_shared(
struct xfs_btree_cur *cur;
int error;
- error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
if (error)
return error;
- cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL);
error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
find_end_of_shared);
xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
- xfs_buf_relse(agbp);
+ xfs_trans_brelse(tp, agbp);
return error;
}
@@ -217,7 +218,7 @@ xfs_reflink_trim_around_shared(
agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
aglen = irec->br_blockcount;
- error = xfs_reflink_find_shared(ip->i_mount, agno, agbno,
+ error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
aglen, &fbno, &flen, true);
if (error)
return error;
@@ -1373,8 +1374,8 @@ xfs_reflink_dirty_extents(
agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
aglen = map[1].br_blockcount;
- error = xfs_reflink_find_shared(mp, agno, agbno, aglen,
- &rbno, &rlen, true);
+ error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
+ aglen, &rbno, &rlen, true);
if (error)
goto out;
if (rbno == NULLAGBLOCK)
@@ -1405,57 +1406,73 @@ out:
return error;
}
-/* Clear the inode reflink flag if there are no shared extents. */
+/* Does this inode need the reflink flag? */
int
-xfs_reflink_clear_inode_flag(
- struct xfs_inode *ip,
- struct xfs_trans **tpp)
+xfs_reflink_inode_has_shared_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ bool *has_shared)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t fbno;
- xfs_filblks_t end;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- xfs_extlen_t aglen;
- xfs_agblock_t rbno;
- xfs_extlen_t rlen;
- struct xfs_bmbt_irec map;
- int nmaps;
- int error = 0;
-
- ASSERT(xfs_is_reflink_inode(ip));
+ struct xfs_bmbt_irec got;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agblock_t rbno;
+ xfs_extlen_t rlen;
+ xfs_extnum_t idx;
+ bool found;
+ int error;
- fbno = 0;
- end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip)));
- while (end - fbno > 0) {
- nmaps = 1;
- /*
- * Look for extents in the file. Skip holes, delalloc, or
- * unwritten extents; they can't be reflinked.
- */
- error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0);
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
if (error)
return error;
- if (nmaps == 0)
- break;
- if (!xfs_bmap_is_real_extent(&map))
- goto next;
+ }
- agno = XFS_FSB_TO_AGNO(mp, map.br_startblock);
- agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock);
- aglen = map.br_blockcount;
+ *has_shared = false;
+ found = xfs_iext_lookup_extent(ip, ifp, 0, &idx, &got);
+ while (found) {
+ if (isnullstartblock(got.br_startblock) ||
+ got.br_state != XFS_EXT_NORM)
+ goto next;
+ agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
+ agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
+ aglen = got.br_blockcount;
- error = xfs_reflink_find_shared(mp, agno, agbno, aglen,
+ error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
&rbno, &rlen, false);
if (error)
return error;
/* Is there still a shared block here? */
- if (rbno != NULLAGBLOCK)
+ if (rbno != NULLAGBLOCK) {
+ *has_shared = true;
return 0;
+ }
next:
- fbno = map.br_startoff + map.br_blockcount;
+ found = xfs_iext_get_extent(ifp, ++idx, &got);
}
+ return 0;
+}
+
+/* Clear the inode reflink flag if there are no shared extents. */
+int
+xfs_reflink_clear_inode_flag(
+ struct xfs_inode *ip,
+ struct xfs_trans **tpp)
+{
+ bool needs_flag;
+ int error = 0;
+
+ ASSERT(xfs_is_reflink_inode(ip));
+
+ error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
+ if (error || needs_flag)
+ return error;
+
/*
* We didn't find any shared blocks so turn off the reflink flag.
* First, get rid of any leftover CoW mappings.
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index d29a7967f029..701487bab468 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -20,9 +20,9 @@
#ifndef __XFS_REFLINK_H
#define __XFS_REFLINK_H 1
-extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
- xfs_extlen_t *flen, bool find_maximal);
+extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
+ xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
@@ -47,6 +47,8 @@ extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe);
+extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,
+ struct xfs_inode *ip, bool *has_shared);
extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
struct xfs_trans **tpp);
extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index c57aa7f18087..91472193643b 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1256,13 +1256,13 @@ xfs_rtpick_extent(
{
xfs_rtblock_t b; /* result block */
int log2; /* log of sequence number */
- __uint64_t resid; /* residual after log removed */
- __uint64_t seq; /* sequence number of file creation */
- __uint64_t *seqp; /* pointer to seqno in inode */
+ uint64_t resid; /* residual after log removed */
+ uint64_t seq; /* sequence number of file creation */
+ uint64_t *seqp; /* pointer to seqno in inode */
ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
- seqp = (__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime;
+ seqp = (uint64_t *)&VFS_I(mp->m_rbmip)->i_atime;
if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
*seqp = 0;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index f13133e6f19f..79defa722bf1 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -107,6 +107,8 @@ xfs_growfs_rt(
/*
* From xfs_rtbitmap.c
*/
+int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len, int val,
xfs_rtblock_t *new, int *stat);
@@ -143,6 +145,7 @@ int xfs_rtalloc_query_all(struct xfs_trans *tp,
# define xfs_growfs_rt(mp,in) (ENOSYS)
# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
# define xfs_rtalloc_query_all(t,f,p) (ENOSYS)
+# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index f11282c96887..056e12b421eb 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -33,9 +33,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{
int i, j;
int len = 0;
- __uint64_t xs_xstrat_bytes = 0;
- __uint64_t xs_write_bytes = 0;
- __uint64_t xs_read_bytes = 0;
+ uint64_t xs_xstrat_bytes = 0;
+ uint64_t xs_write_bytes = 0;
+ uint64_t xs_read_bytes = 0;
static const struct xstats_entry {
char *desc;
@@ -100,7 +100,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
void xfs_stats_clearall(struct xfsstats __percpu *stats)
{
int c;
- __uint32_t vn_active;
+ uint32_t vn_active;
xfs_notice(NULL, "Clearing xfsstats");
for_each_possible_cpu(c) {
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 375840f5a99a..f64d0ae345c4 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -54,125 +54,125 @@ enum {
*/
struct __xfsstats {
# define XFSSTAT_END_EXTENT_ALLOC 4
- __uint32_t xs_allocx;
- __uint32_t xs_allocb;
- __uint32_t xs_freex;
- __uint32_t xs_freeb;
+ uint32_t xs_allocx;
+ uint32_t xs_allocb;
+ uint32_t xs_freex;
+ uint32_t xs_freeb;
# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4)
- __uint32_t xs_abt_lookup;
- __uint32_t xs_abt_compare;
- __uint32_t xs_abt_insrec;
- __uint32_t xs_abt_delrec;
+ uint32_t xs_abt_lookup;
+ uint32_t xs_abt_compare;
+ uint32_t xs_abt_insrec;
+ uint32_t xs_abt_delrec;
# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7)
- __uint32_t xs_blk_mapr;
- __uint32_t xs_blk_mapw;
- __uint32_t xs_blk_unmap;
- __uint32_t xs_add_exlist;
- __uint32_t xs_del_exlist;
- __uint32_t xs_look_exlist;
- __uint32_t xs_cmp_exlist;
+ uint32_t xs_blk_mapr;
+ uint32_t xs_blk_mapw;
+ uint32_t xs_blk_unmap;
+ uint32_t xs_add_exlist;
+ uint32_t xs_del_exlist;
+ uint32_t xs_look_exlist;
+ uint32_t xs_cmp_exlist;
# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4)
- __uint32_t xs_bmbt_lookup;
- __uint32_t xs_bmbt_compare;
- __uint32_t xs_bmbt_insrec;
- __uint32_t xs_bmbt_delrec;
+ uint32_t xs_bmbt_lookup;
+ uint32_t xs_bmbt_compare;
+ uint32_t xs_bmbt_insrec;
+ uint32_t xs_bmbt_delrec;
# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4)
- __uint32_t xs_dir_lookup;
- __uint32_t xs_dir_create;
- __uint32_t xs_dir_remove;
- __uint32_t xs_dir_getdents;
+ uint32_t xs_dir_lookup;
+ uint32_t xs_dir_create;
+ uint32_t xs_dir_remove;
+ uint32_t xs_dir_getdents;
# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3)
- __uint32_t xs_trans_sync;
- __uint32_t xs_trans_async;
- __uint32_t xs_trans_empty;
+ uint32_t xs_trans_sync;
+ uint32_t xs_trans_async;
+ uint32_t xs_trans_empty;
# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7)
- __uint32_t xs_ig_attempts;
- __uint32_t xs_ig_found;
- __uint32_t xs_ig_frecycle;
- __uint32_t xs_ig_missed;
- __uint32_t xs_ig_dup;
- __uint32_t xs_ig_reclaims;
- __uint32_t xs_ig_attrchg;
+ uint32_t xs_ig_attempts;
+ uint32_t xs_ig_found;
+ uint32_t xs_ig_frecycle;
+ uint32_t xs_ig_missed;
+ uint32_t xs_ig_dup;
+ uint32_t xs_ig_reclaims;
+ uint32_t xs_ig_attrchg;
# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5)
- __uint32_t xs_log_writes;
- __uint32_t xs_log_blocks;
- __uint32_t xs_log_noiclogs;
- __uint32_t xs_log_force;
- __uint32_t xs_log_force_sleep;
+ uint32_t xs_log_writes;
+ uint32_t xs_log_blocks;
+ uint32_t xs_log_noiclogs;
+ uint32_t xs_log_force;
+ uint32_t xs_log_force_sleep;
# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10)
- __uint32_t xs_try_logspace;
- __uint32_t xs_sleep_logspace;
- __uint32_t xs_push_ail;
- __uint32_t xs_push_ail_success;
- __uint32_t xs_push_ail_pushbuf;
- __uint32_t xs_push_ail_pinned;
- __uint32_t xs_push_ail_locked;
- __uint32_t xs_push_ail_flushing;
- __uint32_t xs_push_ail_restarts;
- __uint32_t xs_push_ail_flush;
+ uint32_t xs_try_logspace;
+ uint32_t xs_sleep_logspace;
+ uint32_t xs_push_ail;
+ uint32_t xs_push_ail_success;
+ uint32_t xs_push_ail_pushbuf;
+ uint32_t xs_push_ail_pinned;
+ uint32_t xs_push_ail_locked;
+ uint32_t xs_push_ail_flushing;
+ uint32_t xs_push_ail_restarts;
+ uint32_t xs_push_ail_flush;
# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2)
- __uint32_t xs_xstrat_quick;
- __uint32_t xs_xstrat_split;
+ uint32_t xs_xstrat_quick;
+ uint32_t xs_xstrat_split;
# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2)
- __uint32_t xs_write_calls;
- __uint32_t xs_read_calls;
+ uint32_t xs_write_calls;
+ uint32_t xs_read_calls;
# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4)
- __uint32_t xs_attr_get;
- __uint32_t xs_attr_set;
- __uint32_t xs_attr_remove;
- __uint32_t xs_attr_list;
+ uint32_t xs_attr_get;
+ uint32_t xs_attr_set;
+ uint32_t xs_attr_remove;
+ uint32_t xs_attr_list;
# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3)
- __uint32_t xs_iflush_count;
- __uint32_t xs_icluster_flushcnt;
- __uint32_t xs_icluster_flushinode;
+ uint32_t xs_iflush_count;
+ uint32_t xs_icluster_flushcnt;
+ uint32_t xs_icluster_flushinode;
# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8)
- __uint32_t vn_active; /* # vnodes not on free lists */
- __uint32_t vn_alloc; /* # times vn_alloc called */
- __uint32_t vn_get; /* # times vn_get called */
- __uint32_t vn_hold; /* # times vn_hold called */
- __uint32_t vn_rele; /* # times vn_rele called */
- __uint32_t vn_reclaim; /* # times vn_reclaim called */
- __uint32_t vn_remove; /* # times vn_remove called */
- __uint32_t vn_free; /* # times vn_free called */
+ uint32_t vn_active; /* # vnodes not on free lists */
+ uint32_t vn_alloc; /* # times vn_alloc called */
+ uint32_t vn_get; /* # times vn_get called */
+ uint32_t vn_hold; /* # times vn_hold called */
+ uint32_t vn_rele; /* # times vn_rele called */
+ uint32_t vn_reclaim; /* # times vn_reclaim called */
+ uint32_t vn_remove; /* # times vn_remove called */
+ uint32_t vn_free; /* # times vn_free called */
#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9)
- __uint32_t xb_get;
- __uint32_t xb_create;
- __uint32_t xb_get_locked;
- __uint32_t xb_get_locked_waited;
- __uint32_t xb_busy_locked;
- __uint32_t xb_miss_locked;
- __uint32_t xb_page_retries;
- __uint32_t xb_page_found;
- __uint32_t xb_get_read;
+ uint32_t xb_get;
+ uint32_t xb_create;
+ uint32_t xb_get_locked;
+ uint32_t xb_get_locked_waited;
+ uint32_t xb_busy_locked;
+ uint32_t xb_miss_locked;
+ uint32_t xb_page_retries;
+ uint32_t xb_page_found;
+ uint32_t xb_get_read;
/* Version 2 btree counters */
#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX)
- __uint32_t xs_abtb_2[__XBTS_MAX];
+ uint32_t xs_abtb_2[__XBTS_MAX];
#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX)
- __uint32_t xs_abtc_2[__XBTS_MAX];
+ uint32_t xs_abtc_2[__XBTS_MAX];
#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX)
- __uint32_t xs_bmbt_2[__XBTS_MAX];
+ uint32_t xs_bmbt_2[__XBTS_MAX];
#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX)
- __uint32_t xs_ibt_2[__XBTS_MAX];
+ uint32_t xs_ibt_2[__XBTS_MAX];
#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX)
- __uint32_t xs_fibt_2[__XBTS_MAX];
+ uint32_t xs_fibt_2[__XBTS_MAX];
#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX)
- __uint32_t xs_rmap_2[__XBTS_MAX];
+ uint32_t xs_rmap_2[__XBTS_MAX];
#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX)
- __uint32_t xs_refcbt_2[__XBTS_MAX];
+ uint32_t xs_refcbt_2[__XBTS_MAX];
#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6)
- __uint32_t xs_qm_dqreclaims;
- __uint32_t xs_qm_dqreclaim_misses;
- __uint32_t xs_qm_dquot_dups;
- __uint32_t xs_qm_dqcachemisses;
- __uint32_t xs_qm_dqcachehits;
- __uint32_t xs_qm_dqwants;
+ uint32_t xs_qm_dqreclaims;
+ uint32_t xs_qm_dqreclaim_misses;
+ uint32_t xs_qm_dquot_dups;
+ uint32_t xs_qm_dqcachemisses;
+ uint32_t xs_qm_dqcachehits;
+ uint32_t xs_qm_dqwants;
#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2)
- __uint32_t xs_qm_dquot;
- __uint32_t xs_qm_dquot_unused;
+ uint32_t xs_qm_dquot;
+ uint32_t xs_qm_dquot_unused;
/* Extra precision counters */
- __uint64_t xs_xstrat_bytes;
- __uint64_t xs_write_bytes;
- __uint64_t xs_read_bytes;
+ uint64_t xs_xstrat_bytes;
+ uint64_t xs_write_bytes;
+ uint64_t xs_read_bytes;
};
struct xfsstats {
@@ -186,7 +186,7 @@ struct xfsstats {
* simple wrapper for getting the array index of a struct member offset
*/
#define XFS_STATS_CALC_INDEX(member) \
- (offsetof(struct __xfsstats, member) / (int)sizeof(__uint32_t))
+ (offsetof(struct __xfsstats, member) / (int)sizeof(uint32_t))
int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
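The XFS_STATS_CALC_INDEX() macro above turns a struct member into its slot in the flat array of u32 counters by dividing the member's byte offset by sizeof(uint32_t). A minimal userspace sketch of the same trick, using a hypothetical stand-in struct rather than the real struct __xfsstats:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_stats {
	uint32_t a;	/* index 0 */
	uint32_t b;	/* index 1 */
	uint32_t c;	/* index 2 */
};

/* same shape as XFS_STATS_CALC_INDEX() after the type rename */
#define DEMO_CALC_INDEX(member) \
	(offsetof(struct demo_stats, member) / (int)sizeof(uint32_t))

int main(void)
{
	printf("b -> %d\n", (int)DEMO_CALC_INDEX(b));	/* prints 1 */
	printf("c -> %d\n", (int)DEMO_CALC_INDEX(c));	/* prints 2 */
	return 0;
}

The arithmetic only holds while everything ahead of the member is exactly one uint32_t wide, which is why the uint64_t "extra precision" counters sit at the very end of struct __xfsstats.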
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 47d239dcf3f4..38aaacdbb8b3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -52,6 +52,7 @@
#include "xfs_reflink.h"
#include <linux/namei.h>
+#include <linux/dax.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/mount.h>
@@ -195,7 +196,7 @@ xfs_parseargs(
int dsunit = 0;
int dswidth = 0;
int iosize = 0;
- __uint8_t iosizelog = 0;
+ uint8_t iosizelog = 0;
/*
* set up the mount name first so all the errors will refer to the
@@ -555,7 +556,7 @@ xfs_showargs(
return 0;
}
-static __uint64_t
+static uint64_t
xfs_max_file_offset(
unsigned int blockshift)
{
@@ -586,7 +587,7 @@ xfs_max_file_offset(
# endif
#endif
- return (((__uint64_t)pagefactor) << bitshift) - 1;
+ return (((uint64_t)pagefactor) << bitshift) - 1;
}
/*
@@ -621,7 +622,7 @@ xfs_set_inode_alloc(
* the max inode percentage. Used only for inode32.
*/
if (mp->m_maxicount) {
- __uint64_t icount;
+ uint64_t icount;
icount = sbp->sb_dblocks * sbp->sb_imax_pct;
do_div(icount, 100);
@@ -1087,12 +1088,12 @@ xfs_fs_statfs(
struct xfs_mount *mp = XFS_M(dentry->d_sb);
xfs_sb_t *sbp = &mp->m_sb;
struct xfs_inode *ip = XFS_I(d_inode(dentry));
- __uint64_t fakeinos, id;
- __uint64_t icount;
- __uint64_t ifree;
- __uint64_t fdblocks;
+ uint64_t fakeinos, id;
+ uint64_t icount;
+ uint64_t ifree;
+ uint64_t fdblocks;
xfs_extlen_t lsize;
- __int64_t ffree;
+ int64_t ffree;
statp->f_type = XFS_SB_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
@@ -1115,7 +1116,7 @@ xfs_fs_statfs(
statp->f_bavail = statp->f_bfree;
fakeinos = statp->f_bfree << sbp->sb_inopblog;
- statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
+ statp->f_files = MIN(icount + fakeinos, (uint64_t)XFS_MAXINUMBER);
if (mp->m_maxicount)
statp->f_files = min_t(typeof(statp->f_files),
statp->f_files,
@@ -1128,7 +1129,7 @@ xfs_fs_statfs(
/* make sure statp->f_ffree does not underflow */
ffree = statp->f_files - (icount - ifree);
- statp->f_ffree = max_t(__int64_t, ffree, 0);
+ statp->f_ffree = max_t(int64_t, ffree, 0);
if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
@@ -1141,7 +1142,7 @@ xfs_fs_statfs(
STATIC void
xfs_save_resvblks(struct xfs_mount *mp)
{
- __uint64_t resblks = 0;
+ uint64_t resblks = 0;
mp->m_resblks_save = mp->m_resblks;
xfs_reserve_blocks(mp, &resblks, NULL);
@@ -1150,7 +1151,7 @@ xfs_save_resvblks(struct xfs_mount *mp)
STATIC void
xfs_restore_resvblks(struct xfs_mount *mp)
{
- __uint64_t resblks;
+ uint64_t resblks;
if (mp->m_resblks_save) {
resblks = mp->m_resblks_save;
@@ -1765,7 +1766,8 @@ STATIC int __init
xfs_init_zones(void)
{
xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
- offsetof(struct xfs_ioend, io_inline_bio));
+ offsetof(struct xfs_ioend, io_inline_bio),
+ BIOSET_NEED_BVECS);
if (!xfs_ioend_bioset)
goto out;
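As a side note on the hunks above, xfs_max_file_offset() boils down to (pagefactor << bitshift) - 1. A minimal sketch of the arithmetic for the 64-bit case, where pagefactor is 1 and bitshift is BITS_PER_LONG - 1:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t max_file_offset(uint64_t pagefactor, unsigned int bitshift)
{
	/* largest addressable byte offset: pagefactor * 2^bitshift - 1 */
	return (pagefactor << bitshift) - 1;
}

int main(void)
{
	/* 64-bit kernel: 1 shifted by 63, minus one -> 2^63 - 1 */
	printf("%" PRIu64 "\n", max_file_offset(1, 63));
	return 0;
}

On 32-bit kernels the function instead derives pagefactor and bitshift from the page size and CONFIG_LBDAF, which is the #ifdef block the surrounding hunk context belongs to.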
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index f2cb45ed1d54..12cd9cf7de41 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -43,8 +43,8 @@
#include "xfs_log.h"
/* ----- Kernel only functions below ----- */
-STATIC int
-xfs_readlink_bmap(
+int
+xfs_readlink_bmap_ilocked(
struct xfs_inode *ip,
char *link)
{
@@ -143,7 +143,7 @@ xfs_readlink(
if (!pathlen)
goto out;
- if (pathlen < 0 || pathlen > MAXPATHLEN) {
+ if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) {
xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
__func__, (unsigned long long) ip->i_ino,
(long long) pathlen);
@@ -153,7 +153,7 @@ xfs_readlink(
}
- error = xfs_readlink_bmap(ip, link);
+ error = xfs_readlink_bmap_ilocked(ip, link);
out:
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -202,7 +202,7 @@ xfs_symlink(
* Check component lengths of the target path name.
*/
pathlen = strlen(target_path);
- if (pathlen >= MAXPATHLEN) /* total string too long */
+ if (pathlen >= XFS_SYMLINK_MAXLEN) /* total string too long */
return -ENAMETOOLONG;
udqp = gdqp = NULL;
@@ -559,7 +559,7 @@ xfs_inactive_symlink(
return 0;
}
- if (pathlen < 0 || pathlen > MAXPATHLEN) {
+ if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) {
xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
__func__, (unsigned long long)ip->i_ino, pathlen);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
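The three checks above replace the VFS-wide MAXPATHLEN bound with the filesystem's own on-disk symlink limit. A minimal sketch of the create-side check, assuming XFS_SYMLINK_MAXLEN is the on-disk cap (the literal 1024 below is a stand-in for the real constant):

#include <errno.h>
#include <string.h>

#define DEMO_SYMLINK_MAXLEN 1024	/* stand-in for XFS_SYMLINK_MAXLEN */

static int check_symlink_target(const char *target_path)
{
	size_t pathlen = strlen(target_path);

	/* >= because strlen() excludes the terminating NUL */
	if (pathlen >= DEMO_SYMLINK_MAXLEN)
		return -ENAMETOOLONG;
	return 0;
}

The read-side checks in xfs_readlink() and xfs_inactive_symlink() apply the same bound to a length recovered from the inode, so a target that passed the create-side check should never trip them on a healthy filesystem.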
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index e75245d09116..aeaee8923617 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -21,6 +21,7 @@
int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
const char *target_path, umode_t mode, struct xfs_inode **ipp);
+int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link);
int xfs_readlink(struct xfs_inode *ip, char *link);
int xfs_inactive_symlink(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 984a3499cfe3..82afee005140 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -95,6 +95,7 @@ extern xfs_param_t xfs_params;
struct xfs_globals {
int log_recovery_delay; /* log recovery delay (secs) */
+ bool bug_on_assert; /* BUG() the kernel on assert failure */
};
extern struct xfs_globals xfs_globals;
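The new bug_on_assert global gives debug kernels a runtime choice of how assert failures are handled. A userspace analogue of the likely consumer; the real kernel assert path may be structured differently, and the default chosen here is an assumption:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static bool bug_on_assert = true;	/* default is an assumption */

static void demo_assfail(const char *expr, const char *file, int line)
{
	fprintf(stderr, "Assertion failed: %s, file: %s, line: %d\n",
		expr, file, line);
	if (bug_on_assert)
		abort();	/* kernel: BUG() */
	/* otherwise warn and keep running -- kernel: WARN_ON(1) */
}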
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 80ac15fb9638..8b2ccc234f36 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -90,15 +90,25 @@ to_mp(struct kobject *kobject)
return container_of(kobj, struct xfs_mount, m_kobj);
}
+static struct attribute *xfs_mp_attrs[] = {
+ NULL,
+};
+
+struct kobj_type xfs_mp_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_mp_attrs,
+};
+
#ifdef DEBUG
+/* debug */
STATIC ssize_t
-drop_writes_store(
+bug_on_assert_store(
struct kobject *kobject,
const char *buf,
size_t count)
{
- struct xfs_mount *mp = to_mp(kobject);
int ret;
int val;
@@ -107,9 +117,9 @@ drop_writes_store(
return ret;
if (val == 1)
- mp->m_drop_writes = true;
+ xfs_globals.bug_on_assert = true;
else if (val == 0)
- mp->m_drop_writes = false;
+ xfs_globals.bug_on_assert = false;
else
return -EINVAL;
@@ -117,33 +127,13 @@ drop_writes_store(
}
STATIC ssize_t
-drop_writes_show(
+bug_on_assert_show(
struct kobject *kobject,
char *buf)
{
- struct xfs_mount *mp = to_mp(kobject);
-
- return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_drop_writes ? 1 : 0);
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bug_on_assert ? 1 : 0);
}
-XFS_SYSFS_ATTR_RW(drop_writes);
-
-#endif /* DEBUG */
-
-static struct attribute *xfs_mp_attrs[] = {
-#ifdef DEBUG
- ATTR_LIST(drop_writes),
-#endif
- NULL,
-};
-
-struct kobj_type xfs_mp_ktype = {
- .release = xfs_sysfs_release,
- .sysfs_ops = &xfs_sysfs_ops,
- .default_attrs = xfs_mp_attrs,
-};
-
-#ifdef DEBUG
-/* debug */
+XFS_SYSFS_ATTR_RW(bug_on_assert);
STATIC ssize_t
log_recovery_delay_store(
@@ -176,6 +166,7 @@ log_recovery_delay_show(
XFS_SYSFS_ATTR_RW(log_recovery_delay);
static struct attribute *xfs_dbg_attrs[] = {
+ ATTR_LIST(bug_on_assert),
ATTR_LIST(log_recovery_delay),
NULL,
};
@@ -314,47 +305,11 @@ write_grant_head_show(
}
XFS_SYSFS_ATTR_RO(write_grant_head);
-#ifdef DEBUG
-STATIC ssize_t
-log_badcrc_factor_store(
- struct kobject *kobject,
- const char *buf,
- size_t count)
-{
- struct xlog *log = to_xlog(kobject);
- int ret;
- uint32_t val;
-
- ret = kstrtouint(buf, 0, &val);
- if (ret)
- return ret;
-
- log->l_badcrc_factor = val;
-
- return count;
-}
-
-STATIC ssize_t
-log_badcrc_factor_show(
- struct kobject *kobject,
- char *buf)
-{
- struct xlog *log = to_xlog(kobject);
-
- return snprintf(buf, PAGE_SIZE, "%d\n", log->l_badcrc_factor);
-}
-
-XFS_SYSFS_ATTR_RW(log_badcrc_factor);
-#endif /* DEBUG */
-
static struct attribute *xfs_log_attrs[] = {
ATTR_LIST(log_head_lsn),
ATTR_LIST(log_tail_lsn),
ATTR_LIST(reserve_grant_head),
ATTR_LIST(write_grant_head),
-#ifdef DEBUG
- ATTR_LIST(log_badcrc_factor),
-#endif
NULL,
};
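The net effect of the sysfs churn above: the old per-mount drop_writes knob leaves the mount kobject, and the new knob hangs off the global debug kobject, so it presumably appears as /sys/fs/xfs/debug/bug_on_assert (e.g. echo 1 > /sys/fs/xfs/debug/bug_on_assert). A userspace analogue of the store method's parse-and-flip shape, with strtol() standing in for kstrtoint():

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <sys/types.h>

static bool demo_bug_on_assert;

static ssize_t demo_store(const char *buf, size_t count)
{
	char *end;
	long val = strtol(buf, &end, 0);	/* kernel: kstrtoint() */

	if (end == buf)
		return -EINVAL;		/* not a number at all */
	if (val == 1)
		demo_bug_on_assert = true;
	else if (val == 0)
		demo_bug_on_assert = false;
	else
		return -EINVAL;		/* only 0 and 1 are accepted */
	return count;			/* sysfs stores return bytes consumed */
}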
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7c5a16528d8b..bcc3cdf8e1c5 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -251,7 +251,7 @@ TRACE_EVENT(xfs_iext_insert,
__print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
(long)__entry->idx,
__entry->startoff,
- (__int64_t)__entry->startblock,
+ (int64_t)__entry->startblock,
__entry->blockcount,
__entry->state,
(char *)__entry->caller_ip)
@@ -295,7 +295,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
__print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
(long)__entry->idx,
__entry->startoff,
- (__int64_t)__entry->startblock,
+ (int64_t)__entry->startblock,
__entry->blockcount,
__entry->state,
(char *)__entry->caller_ip)
@@ -367,6 +367,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done);
DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
DEFINE_BUF_EVENT(xfs_buf_delwri_split);
+DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
@@ -1280,7 +1281,7 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->count,
__print_symbolic(__entry->type, XFS_IO_TYPES),
__entry->startoff,
- (__int64_t)__entry->startblock,
+ (int64_t)__entry->startblock,
__entry->blockcount)
)
@@ -1490,25 +1491,6 @@ TRACE_EVENT(xfs_extent_busy_trim,
__entry->tlen)
);
-TRACE_EVENT(xfs_trans_commit_lsn,
- TP_PROTO(struct xfs_trans *trans),
- TP_ARGS(trans),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(struct xfs_trans *, tp)
- __field(xfs_lsn_t, lsn)
- ),
- TP_fast_assign(
- __entry->dev = trans->t_mountp->m_super->s_dev;
- __entry->tp = trans;
- __entry->lsn = trans->t_commit_lsn;
- ),
- TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->tp,
- __entry->lsn)
-);
-
TRACE_EVENT(xfs_agf,
TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags,
unsigned long caller_ip),
@@ -2057,7 +2039,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
TP_ARGS(log, buf_f),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(__int64_t, blkno)
+ __field(int64_t, blkno)
__field(unsigned short, len)
__field(unsigned short, flags)
__field(unsigned short, size)
@@ -2106,7 +2088,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
__field(int, fields)
__field(unsigned short, asize)
__field(unsigned short, dsize)
- __field(__int64_t, blkno)
+ __field(int64_t, blkno)
__field(int, len)
__field(int, boffset)
),
@@ -3256,8 +3238,8 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class,
__field(xfs_agnumber_t, agno)
__field(xfs_fsblock_t, bno)
__field(xfs_filblks_t, len)
- __field(__uint64_t, owner)
- __field(__uint64_t, offset)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
__field(unsigned int, flags)
),
TP_fast_assign(
@@ -3297,9 +3279,9 @@ DECLARE_EVENT_CLASS(xfs_getfsmap_class,
__field(dev_t, keydev)
__field(xfs_daddr_t, block)
__field(xfs_daddr_t, len)
- __field(__uint64_t, owner)
- __field(__uint64_t, offset)
- __field(__uint64_t, flags)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(uint64_t, flags)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a07acbf0bd8a..6bdad6f58934 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -105,10 +105,6 @@ typedef struct xfs_trans {
unsigned int t_rtx_res; /* # of rt extents resvd */
unsigned int t_rtx_res_used; /* # of resvd rt extents used */
struct xlog_ticket *t_ticket; /* log mgr ticket */
- xfs_lsn_t t_lsn; /* log seq num of start of
- * transaction. */
- xfs_lsn_t t_commit_lsn; /* log seq num of end of
- * transaction. */
struct xfs_mount *t_mountp; /* ptr to fs mount struct */
struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
unsigned int t_flags; /* misc flags */
@@ -249,7 +245,7 @@ struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp,
struct xfs_rui_log_item *ruip);
int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp,
struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type,
- __uint64_t owner, int whichfork, xfs_fileoff_t startoff,
+ uint64_t owner, int whichfork, xfs_fileoff_t startoff,
xfs_fsblock_t startblock, xfs_filblks_t blockcount,
xfs_exntst_t state, struct xfs_btree_cur **pcur);
@@ -275,6 +271,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops,
enum xfs_bmap_intent_type type, struct xfs_inode *ip,
int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
- xfs_filblks_t blockcount, xfs_exntst_t state);
+ xfs_filblks_t *blockcount, xfs_exntst_t state);
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
index 6408e7d7c08c..14543d93cd4b 100644
--- a/fs/xfs/xfs_trans_bmap.c
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update(
int whichfork,
xfs_fileoff_t startoff,
xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
+ xfs_filblks_t *blockcount,
xfs_exntst_t state)
{
int error;
@@ -196,16 +196,23 @@ xfs_bmap_update_finish_item(
void **state)
{
struct xfs_bmap_intent *bmap;
+ xfs_filblks_t count;
int error;
bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+ count = bmap->bi_bmap.br_blockcount;
error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
bmap->bi_type,
bmap->bi_owner, bmap->bi_whichfork,
bmap->bi_bmap.br_startoff,
bmap->bi_bmap.br_startblock,
- bmap->bi_bmap.br_blockcount,
+ &count,
bmap->bi_bmap.br_state);
+ if (!error && count > 0) {
+ ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
+ bmap->bi_bmap.br_blockcount = count;
+ return -EAGAIN;
+ }
kmem_free(bmap);
return error;
}
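The hunks above (together with the xfs_trans.h signature change) turn blockcount into an in/out parameter so a bmap unmap can make partial progress per transaction. A userspace analogue of the requeue protocol, with hypothetical names and an arbitrary batch size:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for one deferred-unmap step: consume at most one batch. */
static int finish_step(uint64_t *blockcount)
{
	const uint64_t batch = 128;

	if (*blockcount > batch)
		*blockcount -= batch;	/* partial progress */
	else
		*blockcount = 0;	/* fully done */
	return 0;
}

int main(void)
{
	uint64_t count = 300;
	int error;

	do {
		error = finish_step(&count);
		/* leftover work means "relog the intent and retry" */
		if (!error && count > 0)
			error = -EAGAIN;
		printf("remaining %llu, error %d\n",
		       (unsigned long long)count, error);
	} while (error == -EAGAIN);
	return 0;
}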
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8ee29ca132dc..86987d823d76 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
xfs_buf_t *bp)
{
xfs_buf_log_item_t *bip;
+ int freed;
/*
* Default to a normal brelse() call if the tp is NULL.
@@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp,
/*
* Drop our reference to the buf log item.
*/
- atomic_dec(&bip->bli_refcount);
+ freed = atomic_dec_and_test(&bip->bli_refcount);
/*
- * If the buf item is not tracking data in the log, then
- * we must free it before releasing the buffer back to the
- * free pool. Before releasing the buffer to the free pool,
- * clear the transaction pointer in b_fsprivate2 to dissolve
- * its relation to this transaction.
+ * If the buf item is not tracking data in the log, then we must free it
+ * before releasing the buffer back to the free pool.
+ *
+ * If the fs has shutdown and we dropped the last reference, it may fall
+ * on us to release a (possibly dirty) bli if it never made it to the
+ * AIL (e.g., the aborted unpin already happened and didn't release it
+ * due to our reference). Since we're already shutdown and need xa_lock,
+ * just force remove from the AIL and release the bli here.
*/
- if (!xfs_buf_item_dirty(bip)) {
+ if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
+ xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_buf_item_relse(bp);
+ } else if (!xfs_buf_item_dirty(bip)) {
/***
ASSERT(bp->b_pincount == 0);
***/
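The key change above is atomic_dec() becoming atomic_dec_and_test(): only the context that drops the final bli reference sees the counter reach zero, so exactly one path takes on the shutdown cleanup. A C11 sketch of the pattern (demo types, not the kernel API):

#include <stdatomic.h>
#include <stdbool.h>

struct demo_bli {
	atomic_int refcount;
};

/* Returns true iff this call released the last reference; the caller that
 * sees true is the one responsible for freeing. */
static bool demo_put(struct demo_bli *bip)
{
	/* atomic_fetch_sub() returns the old value; old == 1 means we
	 * just took the counter to zero */
	return atomic_fetch_sub(&bip->refcount, 1) == 1;
}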
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index 9ead064b5e90..9b577beb43d7 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -96,7 +96,7 @@ xfs_trans_log_finish_rmap_update(
struct xfs_trans *tp,
struct xfs_rud_log_item *rudp,
enum xfs_rmap_intent_type type,
- __uint64_t owner,
+ uint64_t owner,
int whichfork,
xfs_fileoff_t startoff,
xfs_fsblock_t startblock,