diff options
Diffstat (limited to 'fs')
111 files changed, 2249 insertions, 1235 deletions
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 5e376bb93419..8defc6b3f9a2 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -40,7 +40,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) int block, off; inode = iget_locked(sb, ino); - if (IS_ERR(inode)) + if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; @@ -1045,12 +1045,22 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, int bio_uncopy_user(struct bio *bio) { struct bio_map_data *bmd = bio->bi_private; - int ret = 0; + struct bio_vec *bvec; + int ret = 0, i; - if (!bio_flagged(bio, BIO_NULL_MAPPED)) - ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, - bmd->nr_sgvecs, bio_data_dir(bio) == READ, - 0, bmd->is_our_pages); + if (!bio_flagged(bio, BIO_NULL_MAPPED)) { + /* + * if we're in a workqueue, the request is orphaned, so + * don't copy into a random user address space, just free. + */ + if (current->mm) + ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, + bmd->nr_sgvecs, bio_data_dir(bio) == READ, + 0, bmd->is_our_pages); + else if (bmd->is_our_pages) + bio_for_each_segment_all(bvec, bio, i) + __free_page(bvec->bv_page); + } bio_free_map_data(bmd); bio_put(bio); return ret; @@ -1946,7 +1956,7 @@ int bio_associate_current(struct bio *bio) /* associate blkcg if exists */ rcu_read_lock(); - css = task_subsys_state(current, blkio_subsys_id); + css = task_css(current, blkio_subsys_id); if (css && css_tryget(css)) bio->bi_css = css; rcu_read_unlock(); diff --git a/fs/block_dev.c b/fs/block_dev.c index c7bda5cd3da7..1173a4ee0830 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1519,7 +1519,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); - if (ret > 0 || ret == -EIOCBQUEUED) { + if (ret > 0) { ssize_t err; err = generic_write_sync(file, pos, ret); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index eaf133384a8f..8bc5e8ccb091 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -36,16 +36,23 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, u64 extent_item_pos, struct extent_inode_elem **eie) { - u64 data_offset; - u64 data_len; + u64 offset = 0; struct extent_inode_elem *e; - data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); + if (!btrfs_file_extent_compression(eb, fi) && + !btrfs_file_extent_encryption(eb, fi) && + !btrfs_file_extent_other_encoding(eb, fi)) { + u64 data_offset; + u64 data_len; - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) - return 1; + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + return 1; + offset = extent_item_pos - data_offset; + } e = kmalloc(sizeof(*e), GFP_NOFS); if (!e) @@ -53,7 +60,7 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, e->next = *eie; e->inum = key->objectid; - e->offset = key->offset + (extent_item_pos - data_offset); + e->offset = key->offset + offset; *eie = e; return 0; @@ -189,7 +196,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb; struct btrfs_key key; struct btrfs_file_extent_item *fi; - struct extent_inode_elem *eie = NULL; + struct extent_inode_elem *eie = NULL, *old = NULL; u64 disk_byte; if (level != 0) { @@ -223,6 +230,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (disk_byte == wanted_disk_byte) { eie = NULL; + old = NULL; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -230,18 +238,20 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (ret < 0) break; } - if (!ret) { - ret = ulist_add(parents, eb->start, - (uintptr_t)eie, GFP_NOFS); - if (ret < 0) - break; - if (!extent_item_pos) { - ret = btrfs_next_old_leaf(root, path, - time_seq); - continue; - } + if (ret > 0) + goto next; + ret = ulist_add_merge(parents, eb->start, + (uintptr_t)eie, + (u64 *)&old, GFP_NOFS); + if (ret < 0) + break; + if (!ret && extent_item_pos) { + while (old->next) + old = old->next; + old->next = eie; } } +next: ret = btrfs_next_old_item(root, path, time_seq); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5bf4c39e2ad6..ed504607d8ec 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1271,7 +1271,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, BUG_ON(!eb_rewin); } - extent_buffer_get(eb_rewin); btrfs_tree_read_unlock(eb); free_extent_buffer(eb); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 4253ad580e39..5f8f3341c099 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -747,7 +747,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) WARN_ON(atomic_xchg( &fs_info->mutually_exclusive_operation_running, 1)); task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); - return PTR_RET(task); + return PTR_ERR_OR_ZERO(task); } static int btrfs_dev_replace_kthread(void *data) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 583d98bd065e..fe443fece851 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4048,7 +4048,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - u64 offset_in_extent; + u64 offset_in_extent = 0; /* break if the extent we found is outside the range */ if (em->start >= max || extent_map_end(em) < off) @@ -4064,9 +4064,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* * record the offset from the start of the extent - * for adjusting the disk offset below + * for adjusting the disk offset below. Only do this if the + * extent isn't compressed since our in ram offset may be past + * what we have actually allocated on disk. */ - offset_in_extent = em_start - em->start; + if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + offset_in_extent = em_start - em->start; em_end = extent_map_end(em); em_len = em_end - em_start; emflags = em->flags; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a005fe2c072a..4d2eb6417145 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -596,20 +596,29 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, if (no_splits) goto next; - if (em->block_start < EXTENT_MAP_LAST_BYTE && - em->start < start) { + if (em->start < start) { split->start = em->start; split->len = start - em->start; - split->orig_start = em->orig_start; - split->block_start = em->block_start; - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; - split->ram_bytes = em->ram_bytes; - split->orig_block_len = max(split->block_len, - em->orig_block_len); + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); + split->ram_bytes = em->ram_bytes; + } else { + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + split->ram_bytes = split->len; + } + split->generation = gen; split->bdev = em->bdev; split->flags = flags; @@ -620,8 +629,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split = split2; split2 = NULL; } - if (em->block_start < EXTENT_MAP_LAST_BYTE && - testend && em->start + em->len > start + len) { + if (testend && em->start + em->len > start + len) { u64 diff = start + len - em->start; split->start = start + len; @@ -630,18 +638,28 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->flags = flags; split->compress_type = em->compress_type; split->generation = gen; - split->orig_block_len = max(em->block_len, + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_block_len = max(em->block_len, em->orig_block_len); - split->ram_bytes = em->ram_bytes; - if (compressed) { - split->block_len = em->block_len; - split->block_start = em->block_start; - split->orig_start = em->orig_start; + split->ram_bytes = em->ram_bytes; + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + + diff; + split->orig_start = em->orig_start; + } } else { - split->block_len = split->len; - split->block_start = em->block_start + diff; - split->orig_start = em->orig_start; + split->ram_bytes = split->len; + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; } ret = add_extent_mapping(em_tree, split, modified); @@ -1709,7 +1727,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, */ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; BTRFS_I(inode)->last_sub_trans = root->log_transid; - if (num_written > 0 || num_written == -EIOCBQUEUED) { + if (num_written > 0) { err = generic_write_sync(file, pos, num_written); if (err < 0 && num_written > 0) num_written = err; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6d1b93c8aafb..7bdc83d04d54 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2166,16 +2166,23 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) continue; - extent_offset = btrfs_file_extent_offset(leaf, extent); - if (key.offset - extent_offset != offset) + /* + * 'offset' refers to the exact key.offset, + * NOT the 'offset' field in btrfs_extent_data_ref, ie. + * (key.offset - extent_offset). + */ + if (key.offset != offset) continue; + extent_offset = btrfs_file_extent_offset(leaf, extent); num_bytes = btrfs_file_extent_num_bytes(leaf, extent); + if (extent_offset >= old->extent_offset + old->offset + old->len || extent_offset + num_bytes <= old->extent_offset + old->offset) continue; + ret = 0; break; } @@ -2187,7 +2194,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, backref->root_id = root_id; backref->inum = inum; - backref->file_pos = offset + extent_offset; + backref->file_pos = offset; backref->num_bytes = num_bytes; backref->extent_offset = extent_offset; backref->generation = btrfs_file_extent_generation(leaf, extent); @@ -2210,7 +2217,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, new->path = path; list_for_each_entry_safe(old, tmp, &new->head, list) { - ret = iterate_inodes_from_logical(old->bytenr, fs_info, + ret = iterate_inodes_from_logical(old->bytenr + + old->extent_offset, fs_info, path, record_one_backref, old); BUG_ON(ret < 0 && ret != -ENOENT); @@ -3166,7 +3174,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - ret = PTR_RET(inode); + ret = PTR_ERR_OR_ZERO(inode); if (ret && ret != -ESTALE) goto out; @@ -4391,9 +4399,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) int mask = attr->ia_valid; int ret; - if (newsize == oldsize) - return 0; - /* * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having @@ -5165,14 +5170,31 @@ next: } /* Reached end of directory/root. Bump pos past the last item. */ - if (key_type == BTRFS_DIR_INDEX_KEY) - /* - * 32-bit glibc will use getdents64, but then strtol - - * so the last number we can serve is this. - */ - ctx->pos = 0x7fffffff; - else - ctx->pos++; + ctx->pos++; + + /* + * Stop new entries from being returned after we return the last + * entry. + * + * New directory entries are assigned a strictly increasing + * offset. This means that new entries created during readdir + * are *guaranteed* to be seen in the future by that readdir. + * This has broken buggy programs which operate on names as + * they're returned by readdir. Until we re-use freed offsets + * we have this hack to stop new entries from being returned + * under the assumption that they'll never reach this huge + * offset. + * + * This is being careful not to overflow 32bit loff_t unless the + * last entry requires it because doing so has broken 32bit apps + * in the past. + */ + if (key_type == BTRFS_DIR_INDEX_KEY) { + if (ctx->pos >= INT_MAX) + ctx->pos = LLONG_MAX; + else + ctx->pos = INT_MAX; + } nopos: ret = 0; err: diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d58cce77fc6c..af1931a5960d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -983,12 +983,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, * a dirty root struct and adds it into the list of dead roots that need to * be deleted */ -int btrfs_add_dead_root(struct btrfs_root *root) +void btrfs_add_dead_root(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); - list_add_tail(&root->root_list, &root->fs_info->dead_roots); + if (list_empty(&root->root_list)) + list_add_tail(&root->root_list, &root->fs_info->dead_roots); spin_unlock(&root->fs_info->trans_lock); - return 0; } /* @@ -1925,7 +1925,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) } root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); - list_del(&root->root_list); + list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); pr_debug("btrfs: cleaner removing %llu\n", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 005b0375d18c..defbc4269897 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -143,7 +143,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_add_dead_root(struct btrfs_root *root); +void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2c6791493637..ff60d8978ae2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3746,8 +3746,9 @@ next_slot: } log_extents: + btrfs_release_path(path); + btrfs_release_path(dst_path); if (fast_search) { - btrfs_release_path(dst_path); ret = btrfs_log_changed_extents(trans, root, inode, dst_path); if (ret) { err = ret; @@ -3764,8 +3765,6 @@ log_extents: } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { - btrfs_release_path(path); - btrfs_release_path(dst_path); ret = log_directory_changes(trans, root, inode, path, dst_path); if (ret) { err = ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 78b871753cb6..67a085381845 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3302,7 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); - return PTR_RET(tsk); + return PTR_ERR_OR_ZERO(tsk); } int btrfs_recover_balance(struct btrfs_fs_info *fs_info) diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 45e57cc38200..fc6f4f3a1a9d 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -43,17 +43,18 @@ cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server) server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(server->secmech.md5)) { cifs_dbg(VFS, "could not allocate crypto md5\n"); - return PTR_ERR(server->secmech.md5); + rc = PTR_ERR(server->secmech.md5); + server->secmech.md5 = NULL; + return rc; } size = sizeof(struct shash_desc) + crypto_shash_descsize(server->secmech.md5); server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); if (!server->secmech.sdescmd5) { - rc = -ENOMEM; crypto_free_shash(server->secmech.md5); server->secmech.md5 = NULL; - return rc; + return -ENOMEM; } server->secmech.sdescmd5->shash.tfm = server->secmech.md5; server->secmech.sdescmd5->shash.flags = 0x0; @@ -421,7 +422,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp) if (blobptr + attrsize > blobend) break; if (type == NTLMSSP_AV_NB_DOMAIN_NAME) { - if (!attrsize) + if (!attrsize || attrsize >= CIFS_MAX_DOMAINNAME_LEN) break; if (!ses->domainName) { ses->domainName = @@ -591,6 +592,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server) { + int rc; unsigned int size; /* check if already allocated */ @@ -600,7 +602,9 @@ static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server) server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); if (IS_ERR(server->secmech.hmacmd5)) { cifs_dbg(VFS, "could not allocate crypto hmacmd5\n"); - return PTR_ERR(server->secmech.hmacmd5); + rc = PTR_ERR(server->secmech.hmacmd5); + server->secmech.hmacmd5 = NULL; + return rc; } size = sizeof(struct shash_desc) + diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 4bdd547dbf6f..85ea98d139fc 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -147,18 +147,17 @@ cifs_read_super(struct super_block *sb) goto out_no_root; } + if (cifs_sb_master_tcon(cifs_sb)->nocase) + sb->s_d_op = &cifs_ci_dentry_ops; + else + sb->s_d_op = &cifs_dentry_ops; + sb->s_root = d_make_root(inode); if (!sb->s_root) { rc = -ENOMEM; goto out_no_root; } - /* do that *after* d_make_root() - we want NULL ->d_op for root here */ - if (cifs_sb_master_tcon(cifs_sb)->nocase) - sb->s_d_op = &cifs_ci_dentry_ops; - else - sb->s_d_op = &cifs_dentry_ops; - #ifdef CONFIG_CIFS_NFSD_EXPORT if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cifs_dbg(FYI, "export ops supported\n"); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1fdc37041057..52ca861ed35e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -44,6 +44,7 @@ #define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) #define MAX_SERVER_SIZE 15 #define MAX_SHARE_SIZE 80 +#define CIFS_MAX_DOMAINNAME_LEN 256 /* max domain name length */ #define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */ #define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */ @@ -369,6 +370,9 @@ struct smb_version_operations { void (*generate_signingkey)(struct TCP_Server_Info *server); int (*calc_signature)(struct smb_rqst *rqst, struct TCP_Server_Info *server); + int (*query_mf_symlink)(const unsigned char *path, char *pbuf, + unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, + unsigned int xid); }; struct smb_version_values { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f7e584d047e2..b29a012bed33 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -497,5 +497,7 @@ void cifs_writev_complete(struct work_struct *work); struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete); void cifs_writedata_release(struct kref *refcount); - +int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, + unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, + unsigned int xid); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index fa68813396b5..d67c550c4980 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1675,7 +1675,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (strnlen(string, 256) == 256) { + if (strnlen(string, CIFS_MAX_DOMAINNAME_LEN) + == CIFS_MAX_DOMAINNAME_LEN) { printk(KERN_WARNING "CIFS: domain name too" " long\n"); goto cifs_parse_mount_err; @@ -2276,8 +2277,8 @@ cifs_put_smb_ses(struct cifs_ses *ses) #ifdef CONFIG_KEYS -/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */ -#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1) +/* strlen("cifs:a:") + CIFS_MAX_DOMAINNAME_LEN + 1 */ +#define CIFSCREDS_DESC_SIZE (7 + CIFS_MAX_DOMAINNAME_LEN + 1) /* Populate username and pw fields from keyring if possible */ static int diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1e57f36ea1b2..9d0dd952ad79 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -647,6 +647,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) oflags, &oplock, &cfile->fid.netfid, xid); if (rc == 0) { cifs_dbg(FYI, "posix reopen succeeded\n"); + oparms.reconnect = true; goto reopen_success; } /* @@ -2552,7 +2553,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, mutex_unlock(&inode->i_mutex); } - if (rc > 0 || rc == -EIOCBQUEUED) { + if (rc > 0) { ssize_t err; err = generic_write_sync(file, pos, rc); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index b83c3f5646bd..562044f700e5 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -305,67 +305,89 @@ CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr) } int -CIFSCheckMFSymlink(struct cifs_fattr *fattr, - const unsigned char *path, - struct cifs_sb_info *cifs_sb, unsigned int xid) +open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, + unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, + unsigned int xid) { int rc; int oplock = 0; __u16 netfid = 0; struct tcon_link *tlink; - struct cifs_tcon *pTcon; + struct cifs_tcon *ptcon; struct cifs_io_parms io_parms; - u8 *buf; - char *pbuf; - unsigned int bytes_read = 0; int buf_type = CIFS_NO_BUFFER; - unsigned int link_len = 0; FILE_ALL_INFO file_info; - if (!CIFSCouldBeMFSymlink(fattr)) - /* it's not a symlink */ - return 0; - tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); - pTcon = tlink_tcon(tlink); + ptcon = tlink_tcon(tlink); - rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ, + rc = CIFSSMBOpen(xid, ptcon, path, FILE_OPEN, GENERIC_READ, CREATE_NOT_DIR, &netfid, &oplock, &file_info, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != 0) - goto out; + if (rc != 0) { + cifs_put_tlink(tlink); + return rc; + } if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { - CIFSSMBClose(xid, pTcon, netfid); + CIFSSMBClose(xid, ptcon, netfid); + cifs_put_tlink(tlink); /* it's not a symlink */ - goto out; + return rc; } - buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); - if (!buf) { - rc = -ENOMEM; - goto out; - } - pbuf = buf; io_parms.netfid = netfid; io_parms.pid = current->tgid; - io_parms.tcon = pTcon; + io_parms.tcon = ptcon; io_parms.offset = 0; io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); - CIFSSMBClose(xid, pTcon, netfid); - if (rc != 0) { - kfree(buf); + rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type); + CIFSSMBClose(xid, ptcon, netfid); + cifs_put_tlink(tlink); + return rc; +} + + +int +CIFSCheckMFSymlink(struct cifs_fattr *fattr, + const unsigned char *path, + struct cifs_sb_info *cifs_sb, unsigned int xid) +{ + int rc = 0; + u8 *buf = NULL; + unsigned int link_len = 0; + unsigned int bytes_read = 0; + struct cifs_tcon *ptcon; + + if (!CIFSCouldBeMFSymlink(fattr)) + /* it's not a symlink */ + return 0; + + buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); + if (!buf) { + rc = -ENOMEM; goto out; } + ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb)); + if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink)) + rc = ptcon->ses->server->ops->query_mf_symlink(path, buf, + &bytes_read, cifs_sb, xid); + else + goto out; + + if (rc != 0) + goto out; + + if (bytes_read == 0) /* not a symlink */ + goto out; + rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL); - kfree(buf); if (rc == -EINVAL) { /* it's not a symlink */ rc = 0; @@ -381,7 +403,7 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr, fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO; fattr->cf_dtype = DT_LNK; out: - cifs_put_tlink(tlink); + kfree(buf); return rc; } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index ab8778469394..69d2c826a23b 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -111,6 +111,14 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, return; } + /* + * If we know that the inode will need to be revalidated immediately, + * then don't create a new dentry for it. We'll end up doing an on + * the wire call either way and this spares us an invalidation. + */ + if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) + return; + dentry = d_alloc(parent, name); if (!dentry) return; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 79358e341fd2..08dd37bb23aa 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -197,7 +197,7 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, bytes_ret = 0; } else bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName, - 256, nls_cp); + CIFS_MAX_DOMAINNAME_LEN, nls_cp); bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* account for null terminator */ @@ -255,8 +255,8 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* copy domain */ if (ses->domainName != NULL) { - strncpy(bcc_ptr, ses->domainName, 256); - bcc_ptr += strnlen(ses->domainName, 256); + strncpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + bcc_ptr += strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN); } /* else we will send a null domain name so the server will default to its own domain */ *bcc_ptr = 0; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 6457690731a2..60943978aec3 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -944,6 +944,7 @@ struct smb_version_operations smb1_operations = { .mand_lock = cifs_mand_lock, .mand_unlock_range = cifs_unlock_range, .push_mand_locks = cifs_push_mandatory_locks, + .query_mf_symlink = open_query_close_cifs_symlink, }; struct smb_version_values smb1_values = { diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 301b191270b9..4f2300d020c7 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -42,6 +42,7 @@ static int smb2_crypto_shash_allocate(struct TCP_Server_Info *server) { + int rc; unsigned int size; if (server->secmech.sdeschmacsha256 != NULL) @@ -50,7 +51,9 @@ smb2_crypto_shash_allocate(struct TCP_Server_Info *server) server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0); if (IS_ERR(server->secmech.hmacsha256)) { cifs_dbg(VFS, "could not allocate crypto hmacsha256\n"); - return PTR_ERR(server->secmech.hmacsha256); + rc = PTR_ERR(server->secmech.hmacsha256); + server->secmech.hmacsha256 = NULL; + return rc; } size = sizeof(struct shash_desc) + @@ -87,7 +90,9 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server) server->secmech.sdeschmacsha256 = NULL; crypto_free_shash(server->secmech.hmacsha256); server->secmech.hmacsha256 = NULL; - return PTR_ERR(server->secmech.cmacaes); + rc = PTR_ERR(server->secmech.cmacaes); + server->secmech.cmacaes = NULL; + return rc; } size = sizeof(struct shash_desc) + diff --git a/fs/dcache.c b/fs/dcache.c index 87bdb5329c3c..5aa53bc056ba 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -229,7 +229,7 @@ static void __d_free(struct rcu_head *head) */ static void d_free(struct dentry *dentry) { - BUG_ON(dentry->d_count); + BUG_ON(dentry->d_lockref.count); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); @@ -467,12 +467,12 @@ relock: } if (ref) - dentry->d_count--; + dentry->d_lockref.count--; /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ - if (dentry->d_flags & DCACHE_OP_PRUNE) + if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); dentry_lru_del(dentry); @@ -513,15 +513,10 @@ void dput(struct dentry *dentry) return; repeat: - if (dentry->d_count == 1) + if (dentry->d_lockref.count == 1) might_sleep(); - spin_lock(&dentry->d_lock); - BUG_ON(!dentry->d_count); - if (dentry->d_count > 1) { - dentry->d_count--; - spin_unlock(&dentry->d_lock); + if (lockref_put_or_lock(&dentry->d_lockref)) return; - } if (dentry->d_flags & DCACHE_OP_DELETE) { if (dentry->d_op->d_delete(dentry)) @@ -535,7 +530,7 @@ repeat: dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); - dentry->d_count--; + dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); return; @@ -590,7 +585,7 @@ int d_invalidate(struct dentry * dentry) * We also need to leave mountpoints alone, * directory or not. */ - if (dentry->d_count > 1 && dentry->d_inode) { + if (dentry->d_lockref.count > 1 && dentry->d_inode) { if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { spin_unlock(&dentry->d_lock); return -EBUSY; @@ -606,20 +601,33 @@ EXPORT_SYMBOL(d_invalidate); /* This must be called with d_lock held */ static inline void __dget_dlock(struct dentry *dentry) { - dentry->d_count++; + dentry->d_lockref.count++; } static inline void __dget(struct dentry *dentry) { - spin_lock(&dentry->d_lock); - __dget_dlock(dentry); - spin_unlock(&dentry->d_lock); + lockref_get(&dentry->d_lockref); } struct dentry *dget_parent(struct dentry *dentry) { + int gotref; struct dentry *ret; + /* + * Do optimistic parent lookup without any + * locking. + */ + rcu_read_lock(); + ret = ACCESS_ONCE(dentry->d_parent); + gotref = lockref_get_not_zero(&ret->d_lockref); + rcu_read_unlock(); + if (likely(gotref)) { + if (likely(ret == ACCESS_ONCE(dentry->d_parent))) + return ret; + dput(ret); + } + repeat: /* * Don't need rcu_dereference because we re-check it was correct under @@ -634,8 +642,8 @@ repeat: goto repeat; } rcu_read_unlock(); - BUG_ON(!ret->d_count); - ret->d_count++; + BUG_ON(!ret->d_lockref.count); + ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } @@ -718,7 +726,15 @@ restart: spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); - if (!dentry->d_count) { + if (!dentry->d_lockref.count) { + /* + * inform the fs via d_prune that this dentry + * is about to be unhashed and destroyed. + */ + if ((dentry->d_flags & DCACHE_OP_PRUNE) && + !d_unhashed(dentry)) + dentry->d_op->d_prune(dentry); + __dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -763,12 +779,8 @@ static void try_prune_one_dentry(struct dentry *dentry) /* Prune ancestors. */ dentry = parent; while (dentry) { - spin_lock(&dentry->d_lock); - if (dentry->d_count > 1) { - dentry->d_count--; - spin_unlock(&dentry->d_lock); + if (lockref_put_or_lock(&dentry->d_lockref)) return; - } dentry = dentry_kill(dentry, 1); } } @@ -793,7 +805,7 @@ static void shrink_dentry_list(struct list_head *list) * the LRU because of laziness during lookup. Do not free * it - just keep it off the LRU list. */ - if (dentry->d_count) { + if (dentry->d_lockref.count) { dentry_lru_del(dentry); spin_unlock(&dentry->d_lock); continue; @@ -907,13 +919,14 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) * inform the fs that this dentry is about to be * unhashed and destroyed. */ - if (dentry->d_flags & DCACHE_OP_PRUNE) + if ((dentry->d_flags & DCACHE_OP_PRUNE) && + !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); dentry_lru_del(dentry); __d_shrink(dentry); - if (dentry->d_count != 0) { + if (dentry->d_lockref.count != 0) { printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%s}" " still in use (%d)" @@ -922,7 +935,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry->d_name.name, - dentry->d_count, + dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); BUG(); @@ -933,7 +946,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) list_del(&dentry->d_u.d_child); } else { parent = dentry->d_parent; - parent->d_count--; + parent->d_lockref.count--; list_del(&dentry->d_u.d_child); } @@ -981,7 +994,7 @@ void shrink_dcache_for_umount(struct super_block *sb) dentry = sb->s_root; sb->s_root = NULL; - dentry->d_count--; + dentry->d_lockref.count--; shrink_dcache_for_umount_subtree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { @@ -1147,7 +1160,7 @@ resume: * loop in shrink_dcache_parent() might not make any progress * and loop forever. */ - if (dentry->d_count) { + if (dentry->d_lockref.count) { dentry_lru_del(dentry); } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { dentry_lru_move_list(dentry, dispose); @@ -1269,7 +1282,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) smp_wmb(); dentry->d_name.name = dname; - dentry->d_count = 1; + dentry->d_lockref.count = 1; dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); seqcount_init(&dentry->d_seq); @@ -1782,7 +1795,7 @@ static noinline enum slow_d_compare slow_dentry_cmp( * without taking d_lock and checking d_seq sequence count against @seq * returned here. * - * A refcount may be taken on the found dentry with the __d_rcu_to_refcount + * A refcount may be taken on the found dentry with the d_rcu_to_refcount * function. * * Alternatively, __d_lookup_rcu may be called again to look up the child of @@ -1970,7 +1983,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) goto next; } - dentry->d_count++; + dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; @@ -2069,7 +2082,7 @@ again: spin_lock(&dentry->d_lock); inode = dentry->d_inode; isdir = S_ISDIR(inode->i_mode); - if (dentry->d_count == 1) { + if (dentry->d_lockref.count == 1) { if (!spin_trylock(&inode->i_lock)) { spin_unlock(&dentry->d_lock); cpu_relax(); @@ -2724,6 +2737,17 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, return memcpy(buffer, temp, sz); } +char *simple_dname(struct dentry *dentry, char *buffer, int buflen) +{ + char *end = buffer + buflen; + /* these dentries are never renamed, so d_lock is not needed */ + if (prepend(&end, &buflen, " (deleted)", 11) || + prepend_name(&end, &buflen, &dentry->d_name) || + prepend(&end, &buflen, "/", 1)) + end = ERR_PTR(-ENAMETOOLONG); + return end; +} + /* * Write full pathname from the root of the filesystem into the buffer. */ @@ -2937,7 +2961,7 @@ resume: } if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_count--; + dentry->d_lockref.count--; } spin_unlock(&dentry->d_lock); } @@ -2945,7 +2969,7 @@ resume: struct dentry *child = this_parent; if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { this_parent->d_flags |= DCACHE_GENOCIDE; - this_parent->d_count--; + this_parent->d_lockref.count--; } this_parent = try_to_ascend(this_parent, locked, seq); if (!this_parent) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 4888cb3fdef7..c7c83ff0f752 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -533,8 +533,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove); */ void debugfs_remove_recursive(struct dentry *dentry) { - struct dentry *child; - struct dentry *parent; + struct dentry *child, *next, *parent; if (IS_ERR_OR_NULL(dentry)) return; @@ -544,61 +543,37 @@ void debugfs_remove_recursive(struct dentry *dentry) return; parent = dentry; + down: mutex_lock(&parent->d_inode->i_mutex); + list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) { + if (!debugfs_positive(child)) + continue; - while (1) { - /* - * When all dentries under "parent" has been removed, - * walk up the tree until we reach our starting point. - */ - if (list_empty(&parent->d_subdirs)) { - mutex_unlock(&parent->d_inode->i_mutex); - if (parent == dentry) - break; - parent = parent->d_parent; - mutex_lock(&parent->d_inode->i_mutex); - } - child = list_entry(parent->d_subdirs.next, struct dentry, - d_u.d_child); - next_sibling: - - /* - * If "child" isn't empty, walk down the tree and - * remove all its descendants first. - */ + /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { mutex_unlock(&parent->d_inode->i_mutex); parent = child; - mutex_lock(&parent->d_inode->i_mutex); - continue; + goto down; } - __debugfs_remove(child, parent); - if (parent->d_subdirs.next == &child->d_u.d_child) { - /* - * Try the next sibling. - */ - if (child->d_u.d_child.next != &parent->d_subdirs) { - child = list_entry(child->d_u.d_child.next, - struct dentry, - d_u.d_child); - goto next_sibling; - } - - /* - * Avoid infinite loop if we fail to remove - * one dentry. - */ - mutex_unlock(&parent->d_inode->i_mutex); - break; - } - simple_release_fs(&debugfs_mount, &debugfs_mount_count); + up: + if (!__debugfs_remove(child, parent)) + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } - parent = dentry->d_parent; + mutex_unlock(&parent->d_inode->i_mutex); + child = parent; + parent = parent->d_parent; mutex_lock(&parent->d_inode->i_mutex); - __debugfs_remove(dentry, parent); + + if (child != dentry) { + next = list_entry(child->d_u.d_child.next, struct dentry, + d_u.d_child); + goto up; + } + + if (!__debugfs_remove(child, parent)) + simple_release_fs(&debugfs_mount, &debugfs_mount_count); mutex_unlock(&parent->d_inode->i_mutex); - simple_release_fs(&debugfs_mount, &debugfs_mount_count); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/direct-io.c b/fs/direct-io.c index 7ab90f5081ee..1782023bd68a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -127,6 +127,7 @@ struct dio { spinlock_t bio_lock; /* protects BIO fields below */ int page_errors; /* errno from get_user_pages() */ int is_async; /* is IO async ? */ + bool defer_completion; /* defer AIO completion to workqueue? */ int io_error; /* IO error in completion path */ unsigned long refcount; /* direct_io_worker() and bios */ struct bio *bio_list; /* singly linked via bi_private */ @@ -141,7 +142,10 @@ struct dio { * allocation time. Don't add new fields after pages[] unless you * wish that they not be zeroed. */ - struct page *pages[DIO_PAGES]; /* page buffer */ + union { + struct page *pages[DIO_PAGES]; /* page buffer */ + struct work_struct complete_work;/* deferred AIO completion */ + }; } ____cacheline_aligned_in_smp; static struct kmem_cache *dio_cache __read_mostly; @@ -221,16 +225,16 @@ static inline struct page *dio_get_page(struct dio *dio, * dio_complete() - called when all DIO BIO I/O has been completed * @offset: the byte offset in the file of the completed operation * - * This releases locks as dictated by the locking type, lets interested parties - * know that a DIO operation has completed, and calculates the resulting return - * code for the operation. + * This drops i_dio_count, lets interested parties know that a DIO operation + * has completed, and calculates the resulting return code for the operation. * * It lets the filesystem know if it registered an interest earlier via * get_block. Pass the private field of the map buffer_head so that * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, + bool is_async) { ssize_t transferred = 0; @@ -258,19 +262,36 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is if (ret == 0) ret = transferred; - if (dio->end_io && dio->result) { - dio->end_io(dio->iocb, offset, transferred, - dio->private, ret, is_async); - } else { - inode_dio_done(dio->inode); - if (is_async) - aio_complete(dio->iocb, ret, 0); + if (dio->end_io && dio->result) + dio->end_io(dio->iocb, offset, transferred, dio->private); + + inode_dio_done(dio->inode); + if (is_async) { + if (dio->rw & WRITE) { + int err; + + err = generic_write_sync(dio->iocb->ki_filp, offset, + transferred); + if (err < 0 && ret > 0) + ret = err; + } + + aio_complete(dio->iocb, ret, 0); } + kmem_cache_free(dio_cache, dio); return ret; } +static void dio_aio_complete_work(struct work_struct *work) +{ + struct dio *dio = container_of(work, struct dio, complete_work); + + dio_complete(dio, dio->iocb->ki_pos, 0, true); +} + static int dio_bio_complete(struct dio *dio, struct bio *bio); + /* * Asynchronous IO callback. */ @@ -290,8 +311,13 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - dio_complete(dio, dio->iocb->ki_pos, 0, true); - kmem_cache_free(dio_cache, dio); + if (dio->result && dio->defer_completion) { + INIT_WORK(&dio->complete_work, dio_aio_complete_work); + queue_work(dio->inode->i_sb->s_dio_done_wq, + &dio->complete_work); + } else { + dio_complete(dio, dio->iocb->ki_pos, 0, true); + } } } @@ -511,6 +537,41 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) } /* + * Create workqueue for deferred direct IO completions. We allocate the + * workqueue when it's first needed. This avoids creating workqueue for + * filesystems that don't need it and also allows us to create the workqueue + * late enough so the we can include s_id in the name of the workqueue. + */ +static int sb_init_dio_done_wq(struct super_block *sb) +{ + struct workqueue_struct *wq = alloc_workqueue("dio/%s", + WQ_MEM_RECLAIM, 0, + sb->s_id); + if (!wq) + return -ENOMEM; + /* + * This has to be atomic as more DIOs can race to create the workqueue + */ + cmpxchg(&sb->s_dio_done_wq, NULL, wq); + /* Someone created workqueue before us? Free ours... */ + if (wq != sb->s_dio_done_wq) + destroy_workqueue(wq); + return 0; +} + +static int dio_set_defer_completion(struct dio *dio) +{ + struct super_block *sb = dio->inode->i_sb; + + if (dio->defer_completion) + return 0; + dio->defer_completion = true; + if (!sb->s_dio_done_wq) + return sb_init_dio_done_wq(sb); + return 0; +} + +/* * Call into the fs to map some more disk blocks. We record the current number * of available blocks at sdio->blocks_available. These are in units of the * fs blocksize, (1 << inode->i_blkbits). @@ -581,6 +642,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, /* Store for completion */ dio->private = map_bh->b_private; + + if (ret == 0 && buffer_defer_completion(map_bh)) + ret = dio_set_defer_completion(dio); } return ret; } @@ -1129,11 +1193,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, } /* - * Will be decremented at I/O completion time. - */ - atomic_inc(&inode->i_dio_count); - - /* * For file extending writes updating i_size before data * writeouts complete can expose uninitialized blocks. So * even for AIO, we need to wait for i/o to complete before @@ -1141,11 +1200,33 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, */ dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && (end > i_size_read(inode))); - - retval = 0; - dio->inode = inode; dio->rw = rw; + + /* + * For AIO O_(D)SYNC writes we need to defer completions to a workqueue + * so that we can call ->fsync. + */ + if (dio->is_async && (rw & WRITE) && + ((iocb->ki_filp->f_flags & O_DSYNC) || + IS_SYNC(iocb->ki_filp->f_mapping->host))) { + retval = dio_set_defer_completion(dio); + if (retval) { + /* + * We grab i_mutex only for reads so we don't have + * to release it here + */ + kmem_cache_free(dio_cache, dio); + goto out; + } + } + + /* + * Will be decremented at I/O completion time. + */ + atomic_inc(&inode->i_dio_count); + + retval = 0; sdio.blkbits = blkbits; sdio.blkfactor = i_blkbits - blkbits; sdio.block_in_file = offset >> blkbits; @@ -1269,7 +1350,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (drop_refcount(dio) == 0) { retval = dio_complete(dio, offset, retval, false); - kmem_cache_free(dio_cache, dio); } else BUG_ON(retval != -EIOCBQUEUED); diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 27a6ba9aaeec..0e90f0c91b93 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -267,10 +267,7 @@ void dlm_callback_work(struct work_struct *work) int dlm_callback_start(struct dlm_ls *ls) { ls->ls_callback_wq = alloc_workqueue("dlm_callback", - WQ_UNBOUND | - WQ_MEM_RECLAIM | - WQ_NON_REENTRANT, - 0); + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); if (!ls->ls_callback_wq) { log_print("can't start dlm_callback workqueue"); return -ENOMEM; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 911649a47dd5..142e21655eed 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -493,7 +493,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, { struct dlm_user_proc *proc = file->private_data; struct dlm_write_request *kbuf; - sigset_t tmpsig, allsigs; int error; #ifdef CONFIG_COMPAT @@ -557,9 +556,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, goto out_free; } - sigfillset(&allsigs); - sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); - error = -EINVAL; switch (kbuf->cmd) @@ -567,7 +563,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_LOCK: if (!proc) { log_print("no locking on control device"); - goto out_sig; + goto out_free; } error = device_user_lock(proc, &kbuf->i.lock); break; @@ -575,7 +571,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_UNLOCK: if (!proc) { log_print("no locking on control device"); - goto out_sig; + goto out_free; } error = device_user_unlock(proc, &kbuf->i.lock); break; @@ -583,7 +579,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_DEADLOCK: if (!proc) { log_print("no locking on control device"); - goto out_sig; + goto out_free; } error = device_user_deadlock(proc, &kbuf->i.lock); break; @@ -591,7 +587,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_CREATE_LOCKSPACE: if (proc) { log_print("create/remove only on control device"); - goto out_sig; + goto out_free; } error = device_create_lockspace(&kbuf->i.lspace); break; @@ -599,7 +595,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_REMOVE_LOCKSPACE: if (proc) { log_print("create/remove only on control device"); - goto out_sig; + goto out_free; } error = device_remove_lockspace(&kbuf->i.lspace); break; @@ -607,7 +603,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_PURGE: if (!proc) { log_print("no locking on control device"); - goto out_sig; + goto out_free; } error = device_user_purge(proc, &kbuf->i.purge); break; @@ -617,8 +613,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, kbuf->cmd); } - out_sig: - sigprocmask(SIG_SETMASK, &tmpsig, NULL); out_free: kfree(kbuf); return error; @@ -659,15 +653,11 @@ static int device_close(struct inode *inode, struct file *file) { struct dlm_user_proc *proc = file->private_data; struct dlm_ls *ls; - sigset_t tmpsig, allsigs; ls = dlm_find_lockspace_local(proc->lockspace); if (!ls) return -ENOENT; - sigfillset(&allsigs); - sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); - set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags); dlm_clear_proc_locks(ls, proc); @@ -685,9 +675,6 @@ static int device_close(struct inode *inode, struct file *file) /* FIXME: AUTOFREE: if this ls is no longer used do device_remove_lockspace() */ - sigprocmask(SIG_SETMASK, &tmpsig, NULL); - recalc_sigpending(); - return 0; } diff --git a/fs/efs/inode.c b/fs/efs/inode.c index f3913eb2c474..d15ccf20f1b3 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -57,7 +57,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) struct inode *inode; inode = iget_locked(super, ino); - if (IS_ERR(inode)) + if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9ad17b15b454..293f86741ddb 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1792,7 +1792,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, { int error; int did_lock_epmutex = 0; - struct file *file, *tfile; + struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct epoll_event epds; @@ -1802,20 +1802,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, copy_from_user(&epds, event, sizeof(struct epoll_event))) goto error_return; - /* Get the "struct file *" for the eventpoll file */ error = -EBADF; - file = fget(epfd); - if (!file) + f = fdget(epfd); + if (!f.file) goto error_return; /* Get the "struct file *" for the target file */ - tfile = fget(fd); - if (!tfile) + tf = fdget(fd); + if (!tf.file) goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; - if (!tfile->f_op || !tfile->f_op->poll) + if (!tf.file->f_op || !tf.file->f_op->poll) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ @@ -1828,14 +1827,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * adding an epoll file descriptor inside itself. */ error = -EINVAL; - if (file == tfile || !is_file_epoll(file)) + if (f.file == tf.file || !is_file_epoll(f.file)) goto error_tgt_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ - ep = file->private_data; + ep = f.file->private_data; /* * When we insert an epoll file descriptor, inside another epoll file @@ -1854,14 +1853,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, did_lock_epmutex = 1; } if (op == EPOLL_CTL_ADD) { - if (is_file_epoll(tfile)) { + if (is_file_epoll(tf.file)) { error = -ELOOP; - if (ep_loop_check(ep, tfile) != 0) { + if (ep_loop_check(ep, tf.file) != 0) { clear_tfile_check_list(); goto error_tgt_fput; } } else - list_add(&tfile->f_tfile_llink, &tfile_check_list); + list_add(&tf.file->f_tfile_llink, &tfile_check_list); } mutex_lock_nested(&ep->mtx, 0); @@ -1871,14 +1870,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ - epi = ep_find(ep, tfile, fd); + epi = ep_find(ep, tf.file, fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds.events |= POLLERR | POLLHUP; - error = ep_insert(ep, &epds, tfile, fd); + error = ep_insert(ep, &epds, tf.file, fd); } else error = -EEXIST; clear_tfile_check_list(); @@ -1903,9 +1902,9 @@ error_tgt_fput: if (did_lock_epmutex) mutex_unlock(&epmutex); - fput(tfile); + fdput(tf); error_fput: - fput(file); + fdput(f); error_return: return error; diff --git a/fs/exec.c b/fs/exec.c index 9c73def87642..fd774c7cb483 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -608,7 +608,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) return -ENOMEM; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, old_start, old_end); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. @@ -625,7 +625,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) free_pgd_range(&tlb, old_start, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } - tlb_finish_mmu(&tlb, new_end, old_end); + tlb_finish_mmu(&tlb, old_start, old_end); /* * Shrink the vma to just the new range. Always succeeds. diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index f522425aaa24..bafdd48eefde 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -41,7 +41,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) /** * Check if the given dir-inode refers to an htree-indexed directory - * (or a directory which chould potentially get coverted to use htree + * (or a directory which could potentially get converted to use htree * indexing). * * Return 1 if it is a dx dir, 0 if not diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index ddd715e42a5c..dc5d572ebd6a 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -184,6 +184,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; int flex_bg = 0; + struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); @@ -191,11 +192,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return; } memset(bh->b_data, 0, sb->s_blocksize); @@ -305,7 +304,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, */ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, - unsigned int block_group, + ext4_group_t block_group, struct buffer_head *bh) { ext4_grpblk_t offset; @@ -352,10 +351,11 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, - unsigned int block_group, + ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; + struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return; @@ -366,12 +366,14 @@ void ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } set_buffer_verified(bh); @@ -445,7 +447,10 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) return bh; verify: ext4_validate_block_bitmap(sb, desc, block_group, bh); - return bh; + if (buffer_verified(bh)) + return bh; + put_bh(bh); + return NULL; } /* Returns 0 on success, 1 on error */ @@ -469,7 +474,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ ext4_validate_block_bitmap(sb, desc, block_group, bh); - return 0; + /* ...but check for error just in case errors=continue. */ + return !buffer_verified(bh); } struct buffer_head * diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 3c7d288ae94c..680bb3388919 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -33,7 +33,7 @@ static int ext4_dx_readdir(struct file *, struct dir_context *); /** * Check if the given dir-inode refers to an htree-indexed directory - * (or a directory which chould potentially get coverted to use htree + * (or a directory which could potentially get converted to use htree * indexing). * * Return 1 if it is a dx dir, 0 if not diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b577e45425b0..af815ea9d7cc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -180,7 +180,6 @@ struct ext4_map_blocks { * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 -#define EXT4_IO_END_DIRECT 0x0002 /* * For converting uninitialized extents on a work queue. 'handle' is used for @@ -196,8 +195,6 @@ typedef struct ext4_io_end { unsigned int flag; /* unwritten or not */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ - struct kiocb *iocb; /* iocb struct for AIO */ - int result; /* error value for AIO */ atomic_t count; /* reference counter */ } ext4_io_end_t; @@ -561,6 +558,18 @@ enum { #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 /* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x0400 +#define EXT4_EX_FORCE_CACHE 0x0800 + +/* * Flags used by ext4_free_blocks */ #define EXT4_FREE_BLOCKS_METADATA 0x0001 @@ -569,6 +578,7 @@ enum { #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RESERVE 0x0040 /* * ioctl commands @@ -590,6 +600,7 @@ enum { #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -900,11 +911,9 @@ struct ext4_inode_info { * Completed IOs that need unwritten extents handling and don't have * transaction reserved */ - struct list_head i_unrsv_conversion_list; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ atomic_t i_unwritten; /* Nr. of inflight conversions pending */ struct work_struct i_rsv_conversion_work; - struct work_struct i_unrsv_conversion_work; spinlock_t i_block_reservation_lock; @@ -1276,8 +1285,6 @@ struct ext4_sb_info { struct flex_groups *s_flex_groups; ext4_group_t s_flex_groups_allocated; - /* workqueue for unreserved extent convertions (dio) */ - struct workqueue_struct *unrsv_conversion_wq; /* workqueue for reserved extent conversions (buffered io) */ struct workqueue_struct *rsv_conversion_wq; @@ -1340,9 +1347,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, struct ext4_io_end *io_end) { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - /* Writeback has to have coversion transaction reserved */ - WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle && - !(io_end->flag & EXT4_IO_END_DIRECT)); io_end->flag |= EXT4_IO_END_UNWRITTEN; atomic_inc(&EXT4_I(inode)->i_unwritten); } @@ -1375,6 +1379,7 @@ enum { nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1915,7 +1920,7 @@ extern ext4_group_t ext4_get_group_number(struct super_block *sb, extern void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, - unsigned int block_group, + ext4_group_t block_group, struct buffer_head *bh); extern unsigned int ext4_block_group(struct super_block *sb, ext4_fsblk_t blocknr); @@ -2086,6 +2091,7 @@ extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); @@ -2416,16 +2422,32 @@ do { \ #define EXT4_FREECLUSTERS_WATERMARK 0 #endif +/* Update i_disksize. Requires i_mutex to avoid races with truncate */ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { - /* - * XXX: replace with spinlock if seen contended -bzzz - */ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !mutex_is_locked(&inode->i_mutex)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = newsize; up_write(&EXT4_I(inode)->i_data_sem); - return ; +} + +/* + * Update i_disksize after writeback has been started. Races with truncate + * are avoided by checking i_size under i_data_sem. + */ +static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize) +{ + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (newsize > i_size) + newsize = i_size; + if (newsize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = newsize; + up_write(&EXT4_I(inode)->i_data_sem); } struct ext4_group_info { @@ -2448,9 +2470,15 @@ struct ext4_group_info { #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) @@ -2654,6 +2682,12 @@ extern int ext4_check_blockref(const char *, unsigned int, struct ext4_ext_path; struct ext4_extent; +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. + */ +#define EXT_MAX_BLOCKS 0xffffffff + extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); @@ -2683,7 +2717,8 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *); + struct ext4_ext_path *, + int flags); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern int ext4_find_delalloc_range(struct inode *inode, @@ -2692,7 +2727,7 @@ extern int ext4_find_delalloc_range(struct inode *inode, extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); - +extern int ext4_ext_precache(struct inode *inode); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -2715,7 +2750,6 @@ extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); extern void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc); extern void ext4_end_io_rsv_work(struct work_struct *work); -extern void ext4_end_io_unrsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 51bc821ade90..5074fe23f19e 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -134,12 +134,6 @@ struct ext4_ext_path { */ /* - * Maximum number of logical blocks in a file; ext4_extent's ee_block is - * __le32. - */ -#define EXT_MAX_BLOCKS 0xffffffff - -/* * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an * initialized extent. This is 2^15 and not (2^16 - 1), since we use the * MSB of ee_len field in the extent datastructure to signify if this diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 72a3600aedbd..17ac112ab101 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -255,10 +255,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, set_buffer_prio(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - if (err) { - /* Errors can only happen if there is a bug */ - handle->h_err = err; - __ext4_journal_stop(where, line, handle); + /* Errors can only happen if there is a bug */ + if (WARN_ON_ONCE(err)) { + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); } } else { if (inode) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 2877258d9497..81cfefa9dc0c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -197,7 +197,7 @@ static inline void ext4_journal_callback_add(handle_t *handle, * ext4_journal_callback_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered * @jce: registered journal callback entry to unregister - * Return true if object was sucessfully removed + * Return true if object was successfully removed */ static inline bool ext4_journal_callback_try_del(handle_t *handle, struct ext4_journal_cb_entry *jce) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a61873808f76..54d52afcdb19 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -407,7 +407,7 @@ static int ext4_valid_extent_entries(struct inode *inode, static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, - int depth) + int depth, ext4_fsblk_t pblk) { const char *error_msg; int max = 0; @@ -447,42 +447,149 @@ static int __ext4_ext_check(const char *function, unsigned int line, corrupted: ext4_error_inode(inode, function, line, 0, - "bad header/extent: %s - magic %x, " - "entries %u, max %u(%u), depth %u(%u)", - error_msg, le16_to_cpu(eh->eh_magic), - le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), - max, le16_to_cpu(eh->eh_depth), depth); - + "pblk %llu bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + (unsigned long long) pblk, error_msg, + le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); return -EIO; } -#define ext4_ext_check(inode, eh, depth) \ - __ext4_ext_check(__func__, __LINE__, inode, eh, depth) +#define ext4_ext_check(inode, eh, depth, pblk) \ + __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) int ext4_ext_check_inode(struct inode *inode) { - return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } -static int __ext4_ext_check_block(const char *function, unsigned int line, - struct inode *inode, - struct ext4_extent_header *eh, - int depth, - struct buffer_head *bh) +static struct buffer_head * +__read_extent_tree_block(const char *function, unsigned int line, + struct inode *inode, ext4_fsblk_t pblk, int depth, + int flags) { - int ret; + struct buffer_head *bh; + int err; - if (buffer_verified(bh)) - return 0; - ret = ext4_ext_check(inode, eh, depth); - if (ret) - return ret; + bh = sb_getblk(inode->i_sb, pblk); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + + if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); + err = bh_submit_read(bh); + if (err < 0) + goto errout; + } + if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) + return bh; + err = __ext4_ext_check(function, line, inode, + ext_block_hdr(bh), depth, pblk); + if (err) + goto errout; set_buffer_verified(bh); - return ret; + /* + * If this is a leaf block, cache all of its entries + */ + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { + struct ext4_extent_header *eh = ext_block_hdr(bh); + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); + ext4_lblk_t prev = 0; + int i; + + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { + unsigned int status = EXTENT_STATUS_WRITTEN; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); + int len = ext4_ext_get_actual_len(ex); + + if (prev && (prev != lblk)) + ext4_es_cache_extent(inode, prev, + lblk - prev, ~0, + EXTENT_STATUS_HOLE); + + if (ext4_ext_is_uninitialized(ex)) + status = EXTENT_STATUS_UNWRITTEN; + ext4_es_cache_extent(inode, lblk, len, + ext4_ext_pblock(ex), status); + prev = lblk + len; + } + } + return bh; +errout: + put_bh(bh); + return ERR_PTR(err); + } -#define ext4_ext_check_block(inode, eh, depth, bh) \ - __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh) +#define read_extent_tree_block(inode, pblk, depth, flags) \ + __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ + (depth), (flags)) + +/* + * This function is called to cache a file's extent information in the + * extent status tree + */ +int ext4_ext_precache(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_ext_path *path = NULL; + struct buffer_head *bh; + int i = 0, depth, ret = 0; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return 0; /* not an extent-mapped inode */ + + down_read(&ei->i_data_sem); + depth = ext_depth(inode); + + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + GFP_NOFS); + if (path == NULL) { + up_read(&ei->i_data_sem); + return -ENOMEM; + } + + /* Don't cache anything if there are no external extent blocks */ + if (depth == 0) + goto out; + path[0].p_hdr = ext_inode_hdr(inode); + ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); + if (ret) + goto out; + path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); + while (i >= 0) { + /* + * If this is a leaf block or we've reached the end of + * the index block, go up + */ + if ((i == depth) || + path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx++), + depth - i - 1, + EXT4_EX_FORCE_CACHE); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + break; + } + i++; + path[i].p_bh = bh; + path[i].p_hdr = ext_block_hdr(bh); + path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); + } + ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); +out: + up_read(&ei->i_data_sem); + ext4_ext_drop_refs(path); + kfree(path); + return ret; +} #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -716,7 +823,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) struct ext4_ext_path * ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path *path) + struct ext4_ext_path *path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; @@ -748,20 +855,13 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = sb_getblk(inode->i_sb, path[ppos].p_block); - if (unlikely(!bh)) { - ret = -ENOMEM; + bh = read_extent_tree_block(inode, path[ppos].p_block, --i, + flags); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); goto err; } - if (!bh_uptodate_or_lock(bh)) { - trace_ext4_ext_load_extent(inode, block, - path[ppos].p_block); - ret = bh_submit_read(bh); - if (ret < 0) { - put_bh(bh); - goto err; - } - } + eh = ext_block_hdr(bh); ppos++; if (unlikely(ppos > depth)) { @@ -773,11 +873,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; - i--; - - ret = ext4_ext_check_block(inode, eh, i, bh); - if (ret < 0) - goto err; } path[ppos].p_depth = i; @@ -1198,7 +1293,8 @@ out: * if no free index is found, then it requests in-depth growing. */ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - unsigned int flags, + unsigned int mb_flags, + unsigned int gb_flags, struct ext4_ext_path *path, struct ext4_extent *newext) { @@ -1220,7 +1316,7 @@ repeat: if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, flags, path, newext, i); + err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) goto out; @@ -1228,12 +1324,12 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); + path, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, flags, newext); + err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); if (err) goto out; @@ -1241,7 +1337,7 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); + path, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -1412,29 +1508,21 @@ got_index: ix++; block = ext4_idx_pblock(ix); while (++depth < path->p_depth) { - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; - eh = ext_block_hdr(bh); /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check_block(inode, eh, - path->p_depth - depth, bh)) { - put_bh(bh); - return -EIO; - } + bh = read_extent_tree_block(inode, block, + path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + eh = ext_block_hdr(bh); ix = EXT_FIRST_INDEX(eh); block = ext4_idx_pblock(ix); put_bh(bh); } - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; + bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); eh = ext_block_hdr(bh); - if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) { - put_bh(bh); - return -EIO; - } ex = EXT_FIRST_EXTENT(eh); found_extent: *logical = le32_to_cpu(ex->ee_block); @@ -1705,7 +1793,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, brelse(path[1].p_bh); ext4_free_blocks(handle, inode, NULL, blk, 1, - EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | + EXT4_FREE_BLOCKS_RESERVE); } /* @@ -1793,7 +1882,7 @@ out: */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *newext, int flag) + struct ext4_extent *newext, int gb_flags) { struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; @@ -1802,7 +1891,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, int depth, len, err; ext4_lblk_t next; unsigned uninitialized = 0; - int flags = 0; + int mb_flags = 0; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1817,7 +1906,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, } /* try to insert block into found extent and return */ - if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) { + if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { /* * Try to see whether we should rather test the extent on @@ -1920,7 +2009,7 @@ prepend: if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); - npath = ext4_ext_find_extent(inode, next, NULL); + npath = ext4_ext_find_extent(inode, next, NULL, 0); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); @@ -1939,9 +2028,10 @@ prepend: * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ - if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL) - flags = EXT4_MB_USE_RESERVED; - err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); + if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + mb_flags = EXT4_MB_USE_RESERVED; + err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, + path, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -2007,7 +2097,7 @@ has_space: merge: /* try to merge extents */ - if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) + if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, nearex); @@ -2050,7 +2140,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, path = NULL; } - path = ext4_ext_find_extent(inode, block, path); + path = ext4_ext_find_extent(inode, block, path, 0); if (IS_ERR(path)) { up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); @@ -2195,8 +2285,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { int depth = ext_depth(inode); - unsigned long len; - ext4_lblk_t lblock; + unsigned long len = 0; + ext4_lblk_t lblock = 0; struct ext4_extent *ex; ex = path[depth].p_ext; @@ -2233,7 +2323,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); } else { - lblock = len = 0; BUG(); } @@ -2712,7 +2801,7 @@ again: ext4_lblk_t ee_block; /* find extent for this block */ - path = ext4_ext_find_extent(inode, end, NULL); + path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2754,6 +2843,7 @@ again: */ err = ext4_split_extent_at(handle, inode, path, end + 1, split_flag, + EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_METADATA_NOFAIL); @@ -2782,7 +2872,7 @@ again: path[0].p_hdr = ext_inode_hdr(inode); i = 0; - if (ext4_ext_check(inode, path[0].p_hdr, depth)) { + if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { err = -EIO; goto out; } @@ -2829,10 +2919,12 @@ again: ext_debug("move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); - bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); - if (!bh) { + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx), depth - i - 1, + EXT4_EX_NOCACHE); + if (IS_ERR(bh)) { /* should we reset i_size? */ - err = -EIO; + err = PTR_ERR(bh); break; } /* Yield here to deal with large extent trees. @@ -2842,11 +2934,6 @@ again: err = -EIO; break; } - if (ext4_ext_check_block(inode, ext_block_hdr(bh), - depth - i - 1, bh)) { - err = -EIO; - break; - } path[i + 1].p_bh = bh; /* save actual number of indexes since this @@ -2961,6 +3048,23 @@ void ext4_ext_release(struct super_block *sb) #endif } +static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { @@ -3113,7 +3217,7 @@ static int ext4_split_extent_at(handle_t *handle, goto fix_extent_len; /* update extent status tree */ - err = ext4_es_zeroout(inode, &zero_ex); + err = ext4_zeroout_es(inode, &zero_ex); goto out; } else if (err) @@ -3133,7 +3237,7 @@ fix_extent_len: * ext4_split_extents() splits an extent and mark extent which is covered * by @map as split_flags indicates * - * It may result in splitting the extent into multiple extents (upto three) + * It may result in splitting the extent into multiple extents (up to three) * There are three possibilities: * a> There is no split required * b> Splits in two extents: Split is happening at either end of the extent @@ -3181,7 +3285,7 @@ static int ext4_split_extent(handle_t *handle, * result in split of original leaf or extent zeroout. */ ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -3464,7 +3568,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, out: /* If we have gotten a failure, don't zero out status tree */ if (!err) - err = ext4_es_zeroout(inode, &zero_ex); + err = ext4_zeroout_es(inode, &zero_ex); return err ? err : allocated; } @@ -3565,7 +3669,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, if (err < 0) goto out; ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -4052,7 +4156,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_ext_find_extent(inode, map->m_lblk, NULL); + path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -4412,7 +4516,7 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode) retry: err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); - if (err == ENOMEM) { + if (err == -ENOMEM) { cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; @@ -4744,6 +4848,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return error; } + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + error = ext4_ext_precache(inode); + if (error) + return error; + } + /* fallback to generic here if not in extents fmt */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, @@ -4771,6 +4881,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, error = ext4_fill_fiemap_extents(inode, start_blk, len_blks, fieinfo); } - + ext4_es_lru_add(inode); return error; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 91cb110da1b4..2d1bdbe78c04 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -13,7 +13,6 @@ #include <linux/list_sort.h> #include "ext4.h" #include "extents_status.h" -#include "ext4_extents.h" #include <trace/events/ext4.h> @@ -263,7 +262,7 @@ void ext4_es_find_delayed_extent_range(struct inode *inode, if (tree->cache_es) { es1 = tree->cache_es; if (in_range(lblk, es1->es_lblk, es1->es_len)) { - es_debug("%u cached by [%u/%u) %llu %llx\n", + es_debug("%u cached by [%u/%u) %llu %x\n", lblk, es1->es_lblk, es1->es_len, ext4_es_pblock(es1), ext4_es_status(es1)); goto out; @@ -409,6 +408,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) } #ifdef ES_AGGRESSIVE_TEST +#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */ + static void ext4_es_insert_extent_ext_check(struct inode *inode, struct extent_status *es) { @@ -419,7 +420,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, unsigned short ee_len; int depth, ee_status, es_status; - path = ext4_ext_find_extent(inode, es->es_lblk, NULL); + path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return; @@ -641,13 +642,13 @@ out: */ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned long long status) + unsigned int status) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err = 0; - es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n", + es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", lblk, len, pblk, status, inode->i_ino); if (!len) @@ -684,6 +685,38 @@ error: } /* + * ext4_es_cache_extent() inserts information into the extent status + * tree if and only if there isn't information about the range in + * question already. + */ +void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status) +{ + struct extent_status *es; + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock(&newes, pblk); + ext4_es_store_status(&newes, status); + trace_ext4_es_cache_extent(inode, &newes); + + if (!len) + return; + + BUG_ON(end < lblk); + + write_lock(&EXT4_I(inode)->i_es_lock); + + es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); + if (!es || es->es_lblk > end) + __es_insert_extent(inode, &newes); + write_unlock(&EXT4_I(inode)->i_es_lock); +} + +/* * ext4_es_lookup_extent() looks up an extent in extent status tree. * * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. @@ -871,23 +904,6 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } -int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) -{ - ext4_lblk_t ee_block; - ext4_fsblk_t ee_pblock; - unsigned int ee_len; - - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - ee_pblock = ext4_ext_pblock(ex); - - if (ee_len == 0) - return 0; - - return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, - EXTENT_STATUS_WRITTEN); -} - static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, struct list_head *b) { @@ -895,6 +911,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, eia = list_entry(a, struct ext4_inode_info, i_es_lru); eib = list_entry(b, struct ext4_inode_info, i_es_lru); + if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && + !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) + return 1; + if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && + ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) + return -1; if (eia->i_touch_when == eib->i_touch_when) return 0; if (time_after(eia->i_touch_when, eib->i_touch_when)) @@ -908,21 +930,13 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, { struct ext4_inode_info *ei; struct list_head *cur, *tmp; - LIST_HEAD(skiped); + LIST_HEAD(skipped); int ret, nr_shrunk = 0; + int retried = 0, skip_precached = 1, nr_skipped = 0; spin_lock(&sbi->s_es_lru_lock); - /* - * If the inode that is at the head of LRU list is newer than - * last_sorted time, that means that we need to sort this list. - */ - ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); - if (sbi->s_es_last_sorted < ei->i_touch_when) { - list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); - sbi->s_es_last_sorted = jiffies; - } - +retry: list_for_each_safe(cur, tmp, &sbi->s_es_lru) { /* * If we have already reclaimed all extents from extent @@ -933,9 +947,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, ei = list_entry(cur, struct ext4_inode_info, i_es_lru); - /* Skip the inode that is newer than the last_sorted time */ - if (sbi->s_es_last_sorted < ei->i_touch_when) { - list_move_tail(cur, &skiped); + /* + * Skip the inode that is newer than the last_sorted + * time. Normally we try hard to avoid shrinking + * precached inodes, but we will as a last resort. + */ + if ((sbi->s_es_last_sorted < ei->i_touch_when) || + (skip_precached && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED))) { + nr_skipped++; + list_move_tail(cur, &skipped); continue; } @@ -955,11 +976,33 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, } /* Move the newer inodes into the tail of the LRU list. */ - list_splice_tail(&skiped, &sbi->s_es_lru); + list_splice_tail(&skipped, &sbi->s_es_lru); + INIT_LIST_HEAD(&skipped); + + /* + * If we skipped any inodes, and we weren't able to make any + * forward progress, sort the list and try again. + */ + if ((nr_shrunk == 0) && nr_skipped && !retried) { + retried++; + list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); + sbi->s_es_last_sorted = jiffies; + ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, + i_es_lru); + /* + * If there are no non-precached inodes left on the + * list, start releasing precached extents. + */ + if (ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED)) + skip_precached = 0; + goto retry; + } + spin_unlock(&sbi->s_es_lru_lock); if (locked_ei && nr_shrunk == 0) - nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); + nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); return nr_shrunk; } @@ -1034,10 +1077,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, struct rb_node *node; struct extent_status *es; int nr_shrunk = 0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (ei->i_es_lru_nr == 0) return 0; + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && + __ratelimit(&_rs)) + ext4_warning(inode->i_sb, "forced shrink of precached extents"); + node = rb_first(&tree->root); while (node != NULL) { es = rb_entry(node, struct extent_status, rb_node); diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index e936730cc5b0..167f4ab8ecc3 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -29,16 +29,26 @@ /* * These flags live in the high bits of extent_status.es_pblk */ -#define EXTENT_STATUS_WRITTEN (1ULL << 63) -#define EXTENT_STATUS_UNWRITTEN (1ULL << 62) -#define EXTENT_STATUS_DELAYED (1ULL << 61) -#define EXTENT_STATUS_HOLE (1ULL << 60) +#define ES_SHIFT 60 + +#define EXTENT_STATUS_WRITTEN (1 << 3) +#define EXTENT_STATUS_UNWRITTEN (1 << 2) +#define EXTENT_STATUS_DELAYED (1 << 1) +#define EXTENT_STATUS_HOLE (1 << 0) #define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) +#define ES_WRITTEN (1ULL << 63) +#define ES_UNWRITTEN (1ULL << 62) +#define ES_DELAYED (1ULL << 61) +#define ES_HOLE (1ULL << 60) + +#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \ + ES_DELAYED | ES_HOLE) + struct ext4_sb_info; struct ext4_extent; @@ -60,7 +70,10 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned long long status); + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_find_delayed_extent_range(struct inode *inode, @@ -68,36 +81,35 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); -extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); static inline int ext4_es_is_written(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0; + return (es->es_pblk & ES_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0; + return (es->es_pblk & ES_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0; + return (es->es_pblk & ES_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_HOLE) != 0; + return (es->es_pblk & ES_HOLE) != 0; } -static inline ext4_fsblk_t ext4_es_status(struct extent_status *es) +static inline unsigned int ext4_es_status(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_FLAGS); + return es->es_pblk >> ES_SHIFT; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) { - return (es->es_pblk & ~EXTENT_STATUS_FLAGS); + return es->es_pblk & ~ES_MASK; } static inline void ext4_es_store_pblock(struct extent_status *es, @@ -105,19 +117,16 @@ static inline void ext4_es_store_pblock(struct extent_status *es, { ext4_fsblk_t block; - block = (pb & ~EXTENT_STATUS_FLAGS) | - (es->es_pblk & EXTENT_STATUS_FLAGS); + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); es->es_pblk = block; } static inline void ext4_es_store_status(struct extent_status *es, - unsigned long long status) + unsigned int status) { - ext4_fsblk_t block; - - block = (status & EXTENT_STATUS_FLAGS) | - (es->es_pblk & ~EXTENT_STATUS_FLAGS); - es->es_pblk = block; + es->es_pblk = (((ext4_fsblk_t) + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | + (es->es_pblk & ~ES_MASK)); } extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6f4cc567c382..3da21945ff1f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -149,7 +149,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 || ret == -EIOCBQUEUED) { + if (ret > 0) { ssize_t err; err = generic_write_sync(file, pos, ret); @@ -219,7 +219,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct ext4_inode_info *ei = EXT4_I(inode); struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; @@ -259,22 +258,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present */ - if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { - struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); - - spin_lock(&inode->i_lock); - if (!ei->jinode) { - if (!jinode) { - spin_unlock(&inode->i_lock); - return -ENOMEM; - } - ei->jinode = jinode; - jbd2_journal_init_jbd_inode(ei->jinode, inode); - jinode = NULL; - } - spin_unlock(&inode->i_lock); - if (unlikely(jinode != NULL)) - jbd2_free_inode(jinode); + if (filp->f_mode & FMODE_WRITE) { + int ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + return ret; } return dquot_file_open(inode, filp); } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f03598c6ffd3..137193ff389b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -70,18 +70,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { + struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, - EXT4_INODES_PER_GROUP(sb) / 8); + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } @@ -117,6 +115,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct ext4_group_desc *desc; struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; + struct ext4_group_info *grp; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -185,6 +184,8 @@ verify: put_bh(bh); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, bitmap_blk); + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return NULL; } ext4_unlock_group(sb, block_group); @@ -221,6 +222,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; + struct ext4_group_info *grp; if (!sb) { printk(KERN_ERR "EXT4-fs: %s:%d: inode on " @@ -266,7 +268,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (!bitmap_bh) + /* Don't bother if the inode bitmap is corrupt. */ + grp = ext4_get_group_info(sb, block_group); + if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh) goto error_return; BUFFER_TRACE(bitmap_bh, "get_write_access"); @@ -315,8 +319,10 @@ out: err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; - } else + } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + } error_return: brelse(bitmap_bh); @@ -625,6 +631,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* + * In no journal mode, if an inode has recently been deleted, we want + * to avoid reusing it until we're reasonably sure the inode table + * block has been written back to disk. (Yes, these values are + * somewhat arbitrary...) + */ +#define RECENTCY_MIN 5 +#define RECENTCY_DIRTY 30 + +static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) +{ + struct ext4_group_desc *gdp; + struct ext4_inode *raw_inode; + struct buffer_head *bh; + unsigned long dtime, now; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0, recentcy = RECENTCY_MIN; + + gdp = ext4_get_group_desc(sb, group, NULL); + if (unlikely(!gdp)) + return 0; + + bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + (ino / inodes_per_block)); + if (unlikely(!bh) || !buffer_uptodate(bh)) + /* + * If the block is not in the buffer cache, then it + * must have been written out. + */ + goto out; + + offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); + raw_inode = (struct ext4_inode *) (bh->b_data + offset); + dtime = le32_to_cpu(raw_inode->i_dtime); + now = get_seconds(); + if (buffer_dirty(bh)) + recentcy += RECENTCY_DIRTY; + + if (dtime && (dtime < now) && (now < dtime + recentcy)) + ret = 1; +out: + brelse(bh); + return ret; +} + +/* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -652,6 +703,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, struct inode *ret; ext4_group_t i; ext4_group_t flex_group; + struct ext4_group_info *grp; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -725,25 +777,39 @@ got_group: continue; } + grp = ext4_get_group_info(sb, group); + /* Skip groups with already-known suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (++group == ngroups) + group = 0; + continue; + } + brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); - if (!inode_bitmap_bh) - goto out; + /* Skip groups with suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) { + if (++group == ngroups) + group = 0; + continue; + } repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) inode_bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); - if (ino >= EXT4_INODES_PER_GROUP(sb)) { - if (++group == ngroups) - group = 0; - continue; - } + if (ino >= EXT4_INODES_PER_GROUP(sb)) + goto next_group; if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); continue; } + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, ino)) { + ino++; + goto next_inode; + } if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, @@ -767,8 +833,12 @@ repeat_in_this_group: ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! */ +next_inode: if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; +next_group: + if (++group == ngroups) + group = 0; } err = -ENOSPC; goto out; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 87b30cd357e7..594009f5f523 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -23,7 +23,6 @@ #include <linux/aio.h> #include "ext4_jbd2.h" #include "truncate.h" -#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ #include <trace/events/ext4.h> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ba33c67d6e48..c79fd7dabe79 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -553,16 +553,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, } if (retval > 0) { int ret; - unsigned long long status; + unsigned int status; -#ifdef ES_AGGRESSIVE_TEST - if (retval != map->m_len) { - printk("ES len assertion failed for inode: %lu " - "retval %d != map->m_len %d " - "in %s (lookup)\n", inode->i_ino, retval, - map->m_len, __func__); + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); } -#endif status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -654,16 +653,15 @@ found: if (retval > 0) { int ret; - unsigned long long status; + unsigned int status; -#ifdef ES_AGGRESSIVE_TEST - if (retval != map->m_len) { - printk("ES len assertion failed for inode: %lu " - "retval %d != map->m_len %d " - "in %s (allocation)\n", inode->i_ino, retval, - map->m_len, __func__); + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); } -#endif /* * If the extent has been zeroed out, we don't need to update @@ -729,8 +727,12 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, ret = ext4_map_blocks(handle, inode, &map, flags); if (ret > 0) { + ext4_io_end_t *io_end = ext4_inode_aio(inode); + map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) + set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } @@ -971,7 +973,8 @@ retry_journal: ext4_journal_stop(handle); goto retry_grab; } - wait_on_page_writeback(page); + /* In case writeback began while the page was unlocked */ + wait_for_stable_page(page); if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(page, pos, len, ext4_get_block_write); @@ -1635,16 +1638,15 @@ add_delayed: set_buffer_delay(bh); } else if (retval > 0) { int ret; - unsigned long long status; + unsigned int status; -#ifdef ES_AGGRESSIVE_TEST - if (retval != map->m_len) { - printk("ES len assertion failed for inode: %lu " - "retval %d != map->m_len %d " - "in %s (lookup)\n", inode->i_ino, retval, - map->m_len, __func__); + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); } -#endif status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -1893,12 +1895,32 @@ static int ext4_writepage(struct page *page, return ret; } +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +{ + int len; + loff_t size = i_size_read(mpd->inode); + int err; + + BUG_ON(page->index != mpd->first_page); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + clear_page_dirty_for_io(page); + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); + if (!err) + mpd->wbc->nr_to_write--; + mpd->first_page++; + + return err; +} + #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) /* * mballoc gives us at most this number of blocks... * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). - * The rest of mballoc seems to handle chunks upto full group size. + * The rest of mballoc seems to handle chunks up to full group size. */ #define MAX_WRITEPAGES_EXTENT_LEN 2048 @@ -1907,82 +1929,94 @@ static int ext4_writepage(struct page *page, * * @mpd - extent of blocks * @lblk - logical number of the block in the file - * @b_state - b_state of the buffer head added + * @bh - buffer head we want to add to the extent * - * the function is used to collect contig. blocks in same state + * The function is used to collect contig. blocks in the same state. If the + * buffer doesn't require mapping for writeback and we haven't started the + * extent of buffers to map yet, the function returns 'true' immediately - the + * caller can write the buffer right away. Otherwise the function returns true + * if the block has been added to the extent, false if the block couldn't be + * added. */ -static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, - unsigned long b_state) +static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, + struct buffer_head *bh) { struct ext4_map_blocks *map = &mpd->map; - /* Don't go larger than mballoc is willing to allocate */ - if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) - return 0; + /* Buffer that doesn't need mapping for writeback? */ + if (!buffer_dirty(bh) || !buffer_mapped(bh) || + (!buffer_delay(bh) && !buffer_unwritten(bh))) { + /* So far no extent to map => we write the buffer right away */ + if (map->m_len == 0) + return true; + return false; + } /* First block in the extent? */ if (map->m_len == 0) { map->m_lblk = lblk; map->m_len = 1; - map->m_flags = b_state & BH_FLAGS; - return 1; + map->m_flags = bh->b_state & BH_FLAGS; + return true; } + /* Don't go larger than mballoc is willing to allocate */ + if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) + return false; + /* Can we merge the block to our big extent? */ if (lblk == map->m_lblk + map->m_len && - (b_state & BH_FLAGS) == map->m_flags) { + (bh->b_state & BH_FLAGS) == map->m_flags) { map->m_len++; - return 1; + return true; } - return 0; + return false; } -static bool add_page_bufs_to_extent(struct mpage_da_data *mpd, - struct buffer_head *head, - struct buffer_head *bh, - ext4_lblk_t lblk) +/* + * mpage_process_page_bufs - submit page buffers for IO or add them to extent + * + * @mpd - extent of blocks for mapping + * @head - the first buffer in the page + * @bh - buffer we should start processing from + * @lblk - logical number of the block in the file corresponding to @bh + * + * Walk through page buffers from @bh upto @head (exclusive) and either submit + * the page for IO if all buffers in this page were mapped and there's no + * accumulated extent of buffers to map or add buffers in the page to the + * extent of buffers to map. The function returns 1 if the caller can continue + * by processing the next page, 0 if it should stop adding buffers to the + * extent to map because we cannot extend it anymore. It can also return value + * < 0 in case of error during IO submission. + */ +static int mpage_process_page_bufs(struct mpage_da_data *mpd, + struct buffer_head *head, + struct buffer_head *bh, + ext4_lblk_t lblk) { struct inode *inode = mpd->inode; + int err; ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) >> inode->i_blkbits; do { BUG_ON(buffer_locked(bh)); - if (!buffer_dirty(bh) || !buffer_mapped(bh) || - (!buffer_delay(bh) && !buffer_unwritten(bh)) || - lblk >= blocks) { + if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { /* Found extent to map? */ if (mpd->map.m_len) - return false; - if (lblk >= blocks) - return true; - continue; + return 0; + /* Everything mapped so far and we hit EOF */ + break; } - if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state)) - return false; } while (lblk++, (bh = bh->b_this_page) != head); - return true; -} - -static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) -{ - int len; - loff_t size = i_size_read(mpd->inode); - int err; - - BUG_ON(page->index != mpd->first_page); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - clear_page_dirty_for_io(page); - err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); - if (!err) - mpd->wbc->nr_to_write--; - mpd->first_page++; - - return err; + /* So far everything mapped? Submit the page for IO. */ + if (mpd->map.m_len == 0) { + err = mpage_submit_page(mpd, head->b_page); + if (err < 0) + return err; + } + return lblk < blocks; } /* @@ -2006,8 +2040,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct inode *inode = mpd->inode; struct buffer_head *head, *bh; int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; - ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) - >> inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; sector_t pblock; @@ -2029,7 +2061,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (page->index > end) break; - /* Upto 'end' pages must be contiguous */ + /* Up to 'end' pages must be contiguous */ BUG_ON(page->index != start); bh = head = page_buffers(page); do { @@ -2042,18 +2074,26 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) */ mpd->map.m_len = 0; mpd->map.m_flags = 0; - add_page_bufs_to_extent(mpd, head, bh, - lblk); + /* + * FIXME: If dioread_nolock supports + * blocksize < pagesize, we need to make + * sure we add size mapped so far to + * io_end->size as the following call + * can submit the page for IO. + */ + err = mpage_process_page_bufs(mpd, head, + bh, lblk); pagevec_release(&pvec); - return 0; + if (err > 0) + err = 0; + return err; } if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); - } while (++lblk < blocks && - (bh = bh->b_this_page) != head); + } while (lblk++, (bh = bh->b_this_page) != head); /* * FIXME: This is going to break if dioread_nolock @@ -2202,12 +2242,10 @@ static int mpage_map_and_submit_extent(handle_t *handle, /* Update on-disk size after IO is submitted */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); if (disksize > EXT4_I(inode)->i_disksize) { int err2; - ext4_update_i_disksize(inode, disksize); + ext4_wb_update_i_disksize(inode, disksize); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) ext4_error(inode->i_sb, @@ -2222,7 +2260,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, /* * Calculate the total number of credits to reserve for one writepages * iteration. This is called from ext4_writepages(). We map an extent of - * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping + * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + * bpp - 1 blocks in bpp different extents. */ @@ -2322,14 +2360,10 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) lblk = ((ext4_lblk_t)page->index) << (PAGE_CACHE_SHIFT - blkbits); head = page_buffers(page); - if (!add_page_bufs_to_extent(mpd, head, head, lblk)) + err = mpage_process_page_bufs(mpd, head, head, lblk); + if (err <= 0) goto out; - /* So far everything mapped? Submit the page for IO. */ - if (mpd->map.m_len == 0) { - err = mpage_submit_page(mpd, page); - if (err < 0) - goto out; - } + err = 0; /* * Accumulated enough dirty pages? This doesn't apply @@ -2413,7 +2447,7 @@ static int ext4_writepages(struct address_space *mapping, if (ext4_should_dioread_nolock(inode)) { /* - * We may need to convert upto one extent per block in + * We may need to convert up to one extent per block in * the page and we may dirty the inode. */ rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); @@ -2649,7 +2683,7 @@ retry_journal: goto retry_grab; } /* In case writeback began while the page was unlocked */ - wait_on_page_writeback(page); + wait_for_stable_page(page); ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); if (ret < 0) { @@ -2994,19 +3028,13 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private, int ret, - bool is_async) + ssize_t size, void *private) { - struct inode *inode = file_inode(iocb->ki_filp); ext4_io_end_t *io_end = iocb->private; /* if not async direct IO just return */ - if (!io_end) { - inode_dio_done(inode); - if (is_async) - aio_complete(iocb, ret, 0); + if (!io_end) return; - } ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", @@ -3016,11 +3044,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, iocb->private = NULL; io_end->offset = offset; io_end->size = size; - if (is_async) { - io_end->iocb = iocb; - io_end->result = ret; - } - ext4_put_io_end_defer(io_end); + ext4_put_io_end(io_end); } /* @@ -3105,7 +3129,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ret = -ENOMEM; goto retake_lock; } - io_end->flag |= EXT4_IO_END_DIRECT; /* * Grab reference for DIO. Will be dropped in ext4_end_io_dio() */ @@ -3150,13 +3173,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { WARN_ON(iocb->private != io_end); WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); - WARN_ON(io_end->iocb); - /* - * Generic code already did inode_dio_done() so we - * have to clear EXT4_IO_END_DIRECT to not do it for - * the second time. - */ - io_end->flag = 0; ext4_put_io_end(io_end); iocb->private = NULL; } @@ -3536,6 +3552,18 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) offset; } + if (offset & (sb->s_blocksize - 1) || + (offset + length) & (sb->s_blocksize - 1)) { + /* + * Attach jinode to inode for jbd2 if we do any zeroing of + * partial block + */ + ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + goto out_mutex; + + } + first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; @@ -3604,6 +3632,31 @@ out_mutex: return ret; } +int ext4_inode_attach_jinode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct jbd2_inode *jinode; + + if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) + return 0; + + jinode = jbd2_alloc_inode(GFP_KERNEL); + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + return 0; +} + /* * ext4_truncate() * @@ -3664,6 +3717,12 @@ void ext4_truncate(struct inode *inode) return; } + /* If we zero-out tail of the page, we have to create jinode for jbd2 */ + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { + if (ext4_inode_attach_jinode(inode) < 0) + return; + } + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else @@ -4526,7 +4585,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_journal_stop(handle); } - if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + handle_t *handle; + loff_t oldsize = inode->i_size; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4534,73 +4595,69 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } - } - - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && - (attr->ia_size < inode->i_size)) { - handle_t *handle; - - handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto err_out; - } - if (ext4_handle_valid(handle)) { - error = ext4_orphan_add(handle, inode); - orphan = 1; - } - EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; - ext4_journal_stop(handle); - - if (ext4_should_order_data(inode)) { - error = ext4_begin_ordered_truncate(inode, + if (S_ISREG(inode->i_mode) && + (attr->ia_size < inode->i_size)) { + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, attr->ia_size); - if (error) { - /* Do as much error cleanup as possible */ - handle = ext4_journal_start(inode, - EXT4_HT_INODE, 3); - if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); + if (error) goto err_out; - } - ext4_orphan_del(handle, inode); - orphan = 0; - ext4_journal_stop(handle); + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); goto err_out; } - } - } - - if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != inode->i_size) { - loff_t oldsize = inode->i_size; - - i_size_write(inode, attr->ia_size); - /* - * Blocks are going to be removed from the inode. Wait - * for dio in flight. Temporarily disable - * dioread_nolock to prevent livelock. - */ - if (orphan) { - if (!ext4_should_journal_data(inode)) { - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ext4_inode_resume_unlocked_dio(inode); - } else - ext4_wait_for_tail_page_commit(inode); + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; } + down_write(&EXT4_I(inode)->i_data_sem); + EXT4_I(inode)->i_disksize = attr->ia_size; + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; /* - * Truncate pagecache after we've waited for commit - * in data=journal mode to make pages freeable. + * We have to update i_size under i_data_sem together + * with i_disksize to avoid races with writeback code + * running ext4_wb_update_i_disksize(). */ - truncate_pagecache(inode, oldsize, inode->i_size); + if (!error) + i_size_write(inode, attr->ia_size); + up_write(&EXT4_I(inode)->i_data_sem); + ext4_journal_stop(handle); + if (error) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + } else + i_size_write(inode, attr->ia_size); + + /* + * Blocks are going to be removed from the inode. Wait + * for dio in flight. Temporarily disable + * dioread_nolock to prevent livelock. + */ + if (orphan) { + if (!ext4_should_journal_data(inode)) { + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ext4_inode_resume_unlocked_dio(inode); + } else + ext4_wait_for_tail_page_commit(inode); } - ext4_truncate(inode); + /* + * Truncate pagecache after we've waited for commit + * in data=journal mode to make pages freeable. + */ + truncate_pagecache(inode, oldsize, inode->i_size); } + /* + * We want to call ext4_truncate() even if attr->ia_size == + * inode->i_size for cases like truncation of fallocated space + */ + if (attr->ia_valid & ATTR_SIZE) + ext4_truncate(inode); if (!rc) { setattr_copy(inode, attr); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 9491ac0590f7..a569d335f804 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -17,7 +17,6 @@ #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" -#include "ext4_extents.h" #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) @@ -77,8 +76,10 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); - memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); - memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); + ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); + ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); + ext4_es_lru_del(inode1); + ext4_es_lru_del(inode2); isize = i_size_read(inode1); i_size_write(inode1, i_size_read(inode2)); @@ -622,6 +623,8 @@ resizefs_out: return 0; } + case EXT4_IOC_PRECACHE_EXTENTS: + return ext4_ext_precache(inode); default: return -ENOTTY; @@ -686,6 +689,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_MOVE_EXT: case FITRIM: case EXT4_IOC_RESIZE_FS: + case EXT4_IOC_PRECACHE_EXTENTS: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4bbbf13bd743..a41e3ba8cfaa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -751,13 +751,15 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u clusters in bitmap, %u in gd", + "%u clusters in bitmap, %u in gd; " + "block bitmap corrupt.", free, grp->bb_free); /* - * If we intent to continue, we consider group descritor + * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } mb_set_largest_free_order(sb, grp); @@ -1398,6 +1400,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + /* Don't bother if the block group is corrupt. */ + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return; + mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1423,7 +1429,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, inode ? inode->i_ino : 0, blocknr, "freeing already freed block " - "(bit %u)", block); + "(bit %u); block bitmap corrupt.", + block); + /* Mark the block group as corrupt. */ + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &e4b->bd_info->bb_state); mb_regenerate_buddy(e4b); goto done; } @@ -1790,6 +1800,11 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (err) return err; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { + ext4_mb_unload_buddy(e4b); + return 0; + } + ext4_lock_group(ac->ac_sb, group); max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); @@ -1987,6 +2002,9 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if (cr <= 2 && free < ac->ac_g_ex.fe_len) return 0; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return 0; + /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { int ret = ext4_mb_init_group(ac->ac_sb, group); @@ -4585,6 +4603,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; + struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; @@ -4673,6 +4692,10 @@ do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( + ext4_get_group_info(sb, block_group)))) + return; + /* * Check to see if we are freeing blocks across a group * boundary. @@ -4784,7 +4807,6 @@ do_more: ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); @@ -4792,10 +4814,23 @@ do_more: &sbi->s_flex_groups[flex_group].free_clusters); } - ext4_mb_unload_buddy(&e4b); - - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, + count_clusters); + spin_lock(&ei->i_block_reservation_lock); + if (flags & EXT4_FREE_BLOCKS_METADATA) + ei->i_reserved_meta_blocks += count_clusters; + else + ei->i_reserved_data_blocks += count_clusters; + spin_unlock(&ei->i_block_reservation_lock); + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_reclaim_block(inode, + EXT4_C2B(sbi, count_clusters)); + } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + + ext4_mb_unload_buddy(&e4b); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 49e8bdff9163..2ae73a80c19b 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -39,7 +39,7 @@ static int finish_range(handle_t *handle, struct inode *inode, newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); - path = ext4_ext_find_extent(inode, lb->first_block, NULL); + path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); @@ -494,7 +494,7 @@ int ext4_ext_migrate(struct inode *inode) * superblock modification. * * For the tmp_inode we already have committed the - * trascation that created the inode. Later as and + * transaction that created the inode. Later as and * when we add extents we extent the journal */ /* diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index e86dddbd8296..7fa4d855dbd5 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -37,7 +37,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, int ret = 0; struct ext4_ext_path *path; - path = ext4_ext_find_extent(inode, lblock, *orig_path); + path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE); if (IS_ERR(path)) ret = PTR_ERR(path); else if (path[ext_depth(inode)].p_ext == NULL) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 35f55a0dbc4b..1bec5a5c1e45 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3005,15 +3005,19 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. + * + * n.b. old_{dentry,inode) refers to the source dentry/inode + * while new_{dentry,inode) refers to the destination dentry/inode + * This comes from rename(const char *oldpath, const char *newpath) */ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - handle_t *handle; + handle_t *handle = NULL; struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh, *dir_bh; struct ext4_dir_entry_2 *old_de, *new_de; - int retval, force_da_alloc = 0; + int retval; int inlined = 0, new_inlined = 0; struct ext4_dir_entry_2 *parent_de; @@ -3026,14 +3030,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * in separate transaction */ if (new_dentry->d_inode) dquot_initialize(new_dentry->d_inode); - handle = ext4_journal_start(old_dir, EXT4_HT_DIR, - (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - ext4_handle_sync(handle); old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); /* @@ -3056,6 +3052,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, new_bh = NULL; } } + if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) + ext4_alloc_da_blocks(old_inode); + + handle = ext4_journal_start(old_dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + ext4_handle_sync(handle); + if (S_ISDIR(old_inode->i_mode)) { if (new_inode) { retval = -ENOTEMPTY; @@ -3186,8 +3194,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_mark_inode_dirty(handle, new_inode); if (!new_inode->i_nlink) ext4_orphan_add(handle, new_inode); - if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) - force_da_alloc = 1; } retval = 0; @@ -3195,9 +3201,8 @@ end_rename: brelse(dir_bh); brelse(old_bh); brelse(new_bh); - ext4_journal_stop(handle); - if (retval == 0 && force_da_alloc) - ext4_alloc_da_blocks(old_inode); + if (handle) + ext4_journal_stop(handle); return retval; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 6625d210fb45..d7d0c7b46ed4 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -123,10 +123,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) ext4_finish_bio(bio); bio_put(bio); } - if (io_end->flag & EXT4_IO_END_DIRECT) - inode_dio_done(io_end->inode); - if (io_end->iocb) - aio_complete(io_end->iocb, io_end->result, 0); kmem_cache_free(io_end_cachep, io_end); } @@ -204,19 +200,14 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) struct workqueue_struct *wq; unsigned long flags; - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + /* Only reserved conversions from writeback should enter here */ + WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + WARN_ON(!io_end->handle); spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (io_end->handle) { - wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; - if (list_empty(&ei->i_rsv_conversion_list)) - queue_work(wq, &ei->i_rsv_conversion_work); - list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); - } else { - wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq; - if (list_empty(&ei->i_unrsv_conversion_list)) - queue_work(wq, &ei->i_unrsv_conversion_work); - list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list); - } + wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); } @@ -256,13 +247,6 @@ void ext4_end_io_rsv_work(struct work_struct *work) ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); } -void ext4_end_io_unrsv_work(struct work_struct *work) -{ - struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, - i_unrsv_conversion_work); - ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list); -} - ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bca26f34edf4..049c8a8bdc0e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -762,9 +762,7 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - flush_workqueue(sbi->unrsv_conversion_wq); flush_workqueue(sbi->rsv_conversion_wq); - destroy_workqueue(sbi->unrsv_conversion_wq); destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { @@ -875,14 +873,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); - INIT_LIST_HEAD(&ei->i_unrsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); - INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work); return &ei->vfs_inode; } @@ -1134,8 +1130,8 @@ enum { Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, - Opt_commit, Opt_min_batch_time, Opt_max_batch_time, - Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, + Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -1179,6 +1175,7 @@ static const match_table_t tokens = { {Opt_min_batch_time, "min_batch_time=%u"}, {Opt_max_batch_time, "max_batch_time=%u"}, {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_path, "journal_path=%s"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_abort, "abort"}, @@ -1338,6 +1335,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) #define MOPT_NO_EXT2 0x0100 #define MOPT_NO_EXT3 0x0200 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) +#define MOPT_STRING 0x0400 static const struct mount_opts { int token; @@ -1359,7 +1357,7 @@ static const struct mount_opts { {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, - MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT}, + MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | @@ -1387,6 +1385,7 @@ static const struct mount_opts { {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, {Opt_journal_dev, 0, MOPT_GTE0}, + {Opt_journal_path, 0, MOPT_STRING}, {Opt_journal_ioprio, 0, MOPT_GTE0}, {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, @@ -1480,7 +1479,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } - if (args->from && match_int(args, &arg)) + if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg)) return -1; if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) return -1; @@ -1544,6 +1543,44 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } *journal_devnum = arg; + } else if (token == Opt_journal_path) { + char *journal_path; + struct inode *journal_inode; + struct path path; + int error; + + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + journal_path = match_strdup(&args[0]); + if (!journal_path) { + ext4_msg(sb, KERN_ERR, "error: could not dup " + "journal device string"); + return -1; + } + + error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + if (error) { + ext4_msg(sb, KERN_ERR, "error: could not find " + "journal device path: error %d", error); + kfree(journal_path); + return -1; + } + + journal_inode = path.dentry->d_inode; + if (!S_ISBLK(journal_inode->i_mode)) { + ext4_msg(sb, KERN_ERR, "error: journal path %s " + "is not a block device", journal_path); + path_put(&path); + kfree(journal_path); + return -1; + } + + *journal_devnum = new_encode_dev(journal_inode->i_rdev); + path_put(&path); + kfree(journal_path); } else if (token == Opt_journal_ioprio) { if (arg > 7) { ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" @@ -3483,7 +3520,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (test_opt(sb, DIOREAD_NOLOCK)) { ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and delalloc"); + "both data=journal and dioread_nolock"); goto failed_mount; } if (test_opt(sb, DELALLOC)) @@ -3954,14 +3991,6 @@ no_journal: goto failed_mount4; } - EXT4_SB(sb)->unrsv_conversion_wq = - alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!EXT4_SB(sb)->unrsv_conversion_wq) { - printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); - ret = -ENOMEM; - goto failed_mount4; - } - /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. @@ -4115,8 +4144,6 @@ failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); - if (EXT4_SB(sb)->unrsv_conversion_wq) - destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); failed_mount_wq: if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); @@ -4564,7 +4591,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait) trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); - flush_workqueue(sbi->unrsv_conversion_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots @@ -4600,7 +4626,6 @@ static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) trace_ext4_sync_fs(sb, wait); flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); - flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); dquot_writeback_dquots(sb, -1); if (wait && test_opt(sb, BARRIER)) ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); @@ -4727,6 +4752,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + err = -EINVAL; + goto restore_opts; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dioread_nolock"); + err = -EINVAL; + goto restore_opts; + } + } + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ext4_abort(sb, "Abort forced by user"); @@ -5481,6 +5521,7 @@ static void __exit ext4_exit_fs(void) kset_unregister(ext4_kset); ext4_exit_system_zone(); ext4_exit_pageio(); + ext4_exit_es(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); diff --git a/fs/fcntl.c b/fs/fcntl.c index 6599222536eb..65343c3741ff 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -730,14 +730,14 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC | O_PATH + __FMODE_EXEC | O_PATH | __O_TMPFILE )); fasync_cache = kmem_cache_create("fasync_cache", diff --git a/fs/file_table.c b/fs/file_table.c index b44e4c559786..322cd37626cb 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -385,6 +385,10 @@ static inline void __file_sb_list_add(struct file *file, struct super_block *sb) */ void file_sb_list_add(struct file *file, struct super_block *sb) { + if (likely(!(file->f_mode & FMODE_WRITE))) + return; + if (!S_ISREG(file_inode(file)->i_mode)) + return; lg_local_lock(&files_lglock); __file_sb_list_add(file, sb); lg_local_unlock(&files_lglock); @@ -450,8 +454,6 @@ void mark_files_ro(struct super_block *sb) lg_global_lock(&files_lglock); do_file_list_for_each_entry(sb, f) { - if (!S_ISREG(file_inode(f)->i_mode)) - continue; if (!file_count(f)) continue; if (!(f->f_mode & FMODE_WRITE)) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index aef34b1e635e..adbfd66b380f 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -568,6 +568,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev, return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); } +static DEVICE_ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL); static ssize_t cuse_class_abort_store(struct device *dev, struct device_attribute *attr, @@ -578,12 +579,14 @@ static ssize_t cuse_class_abort_store(struct device *dev, fuse_abort_conn(&cc->fc); return count; } +static DEVICE_ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store); -static struct device_attribute cuse_class_dev_attrs[] = { - __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL), - __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store), - { } +static struct attribute *cuse_class_dev_attrs[] = { + &dev_attr_waiting.attr, + &dev_attr_abort.attr, + NULL, }; +ATTRIBUTE_GROUPS(cuse_class_dev); static struct miscdevice cuse_miscdev = { .minor = MISC_DYNAMIC_MINOR, @@ -609,7 +612,7 @@ static int __init cuse_init(void) if (IS_ERR(cuse_class)) return PTR_ERR(cuse_class); - cuse_class->dev_attrs = cuse_class_dev_attrs; + cuse_class->dev_groups = cuse_class_dev_groups; rc = misc_register(&cuse_miscdev); if (rc) { diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9435384562a2..544a809819c3 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1838,14 +1838,14 @@ int __init gfs2_glock_init(void) glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0); - if (IS_ERR(glock_workqueue)) - return PTR_ERR(glock_workqueue); + if (!glock_workqueue) + return -ENOMEM; gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); - if (IS_ERR(gfs2_delete_workqueue)) { + if (!gfs2_delete_workqueue) { destroy_workqueue(glock_workqueue); - return PTR_ERR(gfs2_delete_workqueue); + return -ENOMEM; } register_shrinker(&glock_shrinker); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 5f2e5224c51c..e2e0a90396e7 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -47,7 +47,8 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) * None of the buffers should be dirty, locked, or pinned. */ -static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) +static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync, + unsigned int nr_revokes) { struct gfs2_sbd *sdp = gl->gl_sbd; struct list_head *head = &gl->gl_ail_list; @@ -57,7 +58,9 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) { + list_for_each_entry_safe_reverse(bd, tmp, head, bd_ail_gl_list) { + if (nr_revokes == 0) + break; bh = bd->bd_bh; if (bh->b_state & b_state) { if (fsync) @@ -65,6 +68,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) gfs2_ail_error(gl, bh); } gfs2_trans_add_revoke(sdp, bd); + nr_revokes--; } GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); spin_unlock(&sdp->sd_ail_lock); @@ -91,7 +95,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) WARN_ON_ONCE(current->journal_info); current->journal_info = &tr; - __gfs2_ail_flush(gl, 0); + __gfs2_ail_flush(gl, 0, tr.tr_revokes); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); @@ -101,15 +105,19 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_sbd; unsigned int revokes = atomic_read(&gl->gl_ail_count); + unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); int ret; if (!revokes) return; - ret = gfs2_trans_begin(sdp, 0, revokes); + while (revokes > max_revokes) + max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + + ret = gfs2_trans_begin(sdp, 0, max_revokes); if (ret) return; - __gfs2_ail_flush(gl, fsync); + __gfs2_ail_flush(gl, fsync, max_revokes); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index bbb2715171cd..64915eeae5a7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -594,7 +594,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, } gfs2_glock_dq_uninit(ghs); if (IS_ERR(d)) - return PTR_RET(d); + return PTR_ERR(d); return error; } else if (error != -ENOENT) { goto fail_gunlock; @@ -1750,6 +1750,10 @@ static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, struct gfs2_holder gh; int ret; + /* For selinux during lookup */ + if (gfs2_glock_is_locked_by_me(ip->i_gl)) + return generic_getxattr(dentry, name, data, size); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); ret = gfs2_glock_nq(&gh); if (ret == 0) { diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index e04d0e09ee7b..7b0f5043cf24 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -155,7 +155,7 @@ static int __init init_gfs2_fs(void) goto fail_wq; gfs2_control_wq = alloc_workqueue("gfs2_control", - WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0); + WQ_UNBOUND | WQ_FREEZABLE, 0); if (!gfs2_control_wq) goto fail_recovery; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a3f868ae3fd4..d19b30ababf1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -463,6 +463,14 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, return inode; } +/* + * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never + * be taken from reclaim -- unlike regular filesystems. This needs an + * annotation because huge_pmd_share() does an allocation under + * i_mmap_mutex. + */ +struct lock_class_key hugetlbfs_i_mmap_mutex_key; + static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev) @@ -474,6 +482,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); + lockdep_set_class(&inode->i_mapping->i_mmap_mutex, + &hugetlbfs_i_mmap_mutex_key); inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -916,14 +926,8 @@ static int get_hstate_idx(int page_size_log) return h - hstates; } -static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen) -{ - return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", - dentry->d_name.name); -} - static struct dentry_operations anon_ops = { - .d_dname = hugetlb_dname + .d_dname = simple_dname }; /* diff --git a/fs/inode.c b/fs/inode.c index d6dfb09c8280..93a0625b46e4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1525,7 +1525,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -void touch_atime(struct path *path) +void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = path->dentry->d_inode; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 559bec1a37b4..cf2fc0594063 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -343,14 +343,14 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, struct page *page = bh->b_page; __u8 *addr; __u32 csum32; + __be32 seq; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; - sequence = cpu_to_be32(sequence); + seq = cpu_to_be32(sequence); addr = kmap_atomic(page); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, - sizeof(sequence)); + csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), bh->b_size); kunmap_atomic(addr); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 02c7ad9d7a41..52032647dd4a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -130,9 +130,10 @@ int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; } -static __u32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) +static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) { - __u32 csum, old_csum; + __u32 csum; + __be32 old_csum; old_csum = sb->s_checksum; sb->s_checksum = 0; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index d4851464b57e..3929c50428b1 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -178,7 +178,8 @@ static int jbd2_descr_block_csum_verify(journal_t *j, void *buf) { struct jbd2_journal_block_tail *tail; - __u32 provided, calculated; + __be32 provided; + __u32 calculated; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -190,8 +191,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j, calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); tail->t_checksum = provided; - provided = be32_to_cpu(provided); - return provided == calculated; + return provided == cpu_to_be32(calculated); } /* @@ -381,7 +381,8 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh, static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) { struct commit_header *h; - __u32 provided, calculated; + __be32 provided; + __u32 calculated; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -392,21 +393,20 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); h->h_chksum[0] = provided; - provided = be32_to_cpu(provided); - return provided == calculated; + return provided == cpu_to_be32(calculated); } static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, void *buf, __u32 sequence) { __u32 csum32; + __be32 seq; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; - sequence = cpu_to_be32(sequence); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, - sizeof(sequence)); + seq = cpu_to_be32(sequence); + csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); return tag->t_checksum == cpu_to_be16(csum32); @@ -808,7 +808,8 @@ static int jbd2_revoke_block_csum_verify(journal_t *j, void *buf) { struct jbd2_journal_revoke_tail *tail; - __u32 provided, calculated; + __be32 provided; + __u32 calculated; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -820,8 +821,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j, calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); tail->r_checksum = provided; - provided = be32_to_cpu(provided); - return provided == calculated; + return provided == cpu_to_be32(calculated); } /* Scan a revoke record, marking all blocks mentioned as revoked. */ diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 8743ba9c6742..984c2bbf4f61 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -3047,6 +3047,14 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) dir_index = (u32) ctx->pos; + /* + * NFSv4 reserves cookies 1 and 2 for . and .. so the value + * we return to the vfs is one greater than the one we use + * internally. + */ + if (dir_index) + dir_index--; + if (dir_index > 1) { struct dir_table_slot dirtab_slot; @@ -3086,7 +3094,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) if (p->header.flag & BT_INTERNAL) { jfs_err("jfs_readdir: bad index table"); DT_PUTPAGE(mp); - ctx->pos = -1; + ctx->pos = DIREND; return 0; } } else { @@ -3094,14 +3102,14 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) /* * self "." */ - ctx->pos = 0; + ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; } /* * parent ".." */ - ctx->pos = 1; + ctx->pos = 2; if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; @@ -3122,22 +3130,23 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) /* * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 * - * pn = index = 0: First entry "." - * pn = 0; index = 1: Second entry ".." + * pn = 0; index = 1: First entry "." + * pn = 0; index = 2: Second entry ".." * pn > 0: Real entries, pn=1 -> leftmost page * pn = index = -1: No more entries */ dtpos = ctx->pos; - if (dtpos == 0) { + if (dtpos < 2) { /* build "." entry */ + ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; - dtoffset->index = 1; + dtoffset->index = 2; ctx->pos = dtpos; } if (dtoffset->pn == 0) { - if (dtoffset->index == 1) { + if (dtoffset->index == 2) { /* build ".." entry */ if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; @@ -3228,6 +3237,12 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) } jfs_dirent->position = unique_pos++; } + /* + * We add 1 to the index because we may + * use a value of 2 internally, and NFSv4 + * doesn't like that. + */ + jfs_dirent->position++; } else { jfs_dirent->position = dtpos; len = min(d_namleft, DTLHDRDATALEN_LEGACY); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 01bfe7662751..41e491b8e5d7 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -64,12 +64,17 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) nlm_init->protocol, nlm_version, nlm_init->hostname, nlm_init->noresvport, nlm_init->net); - if (host == NULL) { - lockd_down(nlm_init->net); - return ERR_PTR(-ENOLCK); - } + if (host == NULL) + goto out_nohost; + if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL) + goto out_nobind; return host; +out_nobind: + nlmclnt_release_host(host); +out_nohost: + lockd_down(nlm_init->net); + return ERR_PTR(-ENOLCK); } EXPORT_SYMBOL_GPL(nlmclnt_init); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 9760ecb9b60f..acd394716349 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -125,14 +125,15 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) { struct nlm_args *argp = &req->a_args; struct nlm_lock *lock = &argp->lock; + char *nodename = req->a_host->h_rpcclnt->cl_nodename; nlmclnt_next_cookie(&argp->cookie); memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh)); - lock->caller = utsname()->nodename; + lock->caller = nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", (unsigned int)fl->fl_u.nfs_fl.owner->pid, - utsname()->nodename); + nodename); lock->svid = fl->fl_u.nfs_fl.owner->pid; lock->fl.fl_start = fl->fl_start; lock->fl.fl_end = fl->fl_end; diff --git a/fs/namei.c b/fs/namei.c index 8b61d103a8a7..f415c6683a83 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -494,6 +494,50 @@ static inline void unlock_rcu_walk(void) br_read_unlock(&vfsmount_lock); } +/* + * When we move over from the RCU domain to properly refcounted + * long-lived dentries, we need to check the sequence numbers + * we got before lookup very carefully. + * + * We cannot blindly increment a dentry refcount - even if it + * is not locked - if it is zero, because it may have gone + * through the final d_kill() logic already. + * + * So for a zero refcount, we need to get the spinlock (which is + * safe even for a dead dentry because the de-allocation is + * RCU-delayed), and check the sequence count under the lock. + * + * Once we have checked the sequence count, we know it is live, + * and since we hold the spinlock it cannot die from under us. + * + * In contrast, if the reference count wasn't zero, we can just + * increment the lockref without having to take the spinlock. + * Even if the sequence number ends up being stale, we haven't + * gone through the final dput() and killed the dentry yet. + */ +static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq) +{ + int gotref; + + gotref = lockref_get_or_lock(&dentry->d_lockref); + + /* Does the sequence number still match? */ + if (read_seqcount_retry(validate, seq)) { + if (gotref) + dput(dentry); + else + spin_unlock(&dentry->d_lock); + return -ECHILD; + } + + /* Get the ref now, if we couldn't get it originally */ + if (!gotref) { + dentry->d_lockref.count++; + spin_unlock(&dentry->d_lock); + } + return 0; +} + /** * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data @@ -518,29 +562,28 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) nd->root.dentry != fs->root.dentry) goto err_root; } - spin_lock(&parent->d_lock); + + /* + * For a negative lookup, the lookup sequence point is the parents + * sequence point, and it only needs to revalidate the parent dentry. + * + * For a positive lookup, we need to move both the parent and the + * dentry from the RCU domain to be properly refcounted. And the + * sequence number in the dentry validates *both* dentry counters, + * since we checked the sequence number of the parent after we got + * the child sequence number. So we know the parent must still + * be valid if the child sequence number is still valid. + */ if (!dentry) { - if (!__d_rcu_to_refcount(parent, nd->seq)) - goto err_parent; + if (d_rcu_to_refcount(parent, &parent->d_seq, nd->seq) < 0) + goto err_root; BUG_ON(nd->inode != parent->d_inode); } else { - if (dentry->d_parent != parent) + if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) + goto err_root; + if (d_rcu_to_refcount(parent, &dentry->d_seq, nd->seq) < 0) goto err_parent; - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!__d_rcu_to_refcount(dentry, nd->seq)) - goto err_child; - /* - * If the sequence check on the child dentry passed, then - * the child has not been removed from its parent. This - * means the parent dentry must be valid and able to take - * a reference at this point. - */ - BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); - BUG_ON(!parent->d_count); - parent->d_count++; - spin_unlock(&dentry->d_lock); } - spin_unlock(&parent->d_lock); if (want_root) { path_get(&nd->root); spin_unlock(&fs->lock); @@ -551,10 +594,8 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) nd->flags &= ~LOOKUP_RCU; return 0; -err_child: - spin_unlock(&dentry->d_lock); err_parent: - spin_unlock(&parent->d_lock); + dput(dentry); err_root: if (want_root) spin_unlock(&fs->lock); @@ -585,14 +626,11 @@ static int complete_walk(struct nameidata *nd) nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - spin_lock(&dentry->d_lock); - if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { - spin_unlock(&dentry->d_lock); + + if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) { unlock_rcu_walk(); return -ECHILD; } - BUG_ON(nd->inode != dentry->d_inode); - spin_unlock(&dentry->d_lock); mntget(nd->path.mnt); unlock_rcu_walk(); } @@ -2184,6 +2222,188 @@ user_path_parent(int dfd, const char __user *path, struct nameidata *nd, return s; } +/** + * umount_lookup_last - look up last component for umount + * @nd: pathwalk nameidata - currently pointing at parent directory of "last" + * @path: pointer to container for result + * + * This is a special lookup_last function just for umount. In this case, we + * need to resolve the path without doing any revalidation. + * + * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since + * mountpoints are always pinned in the dcache, their ancestors are too. Thus, + * in almost all cases, this lookup will be served out of the dcache. The only + * cases where it won't are if nd->last refers to a symlink or the path is + * bogus and it doesn't exist. + * + * Returns: + * -error: if there was an error during lookup. This includes -ENOENT if the + * lookup found a negative dentry. The nd->path reference will also be + * put in this case. + * + * 0: if we successfully resolved nd->path and found it to not to be a + * symlink that needs to be followed. "path" will also be populated. + * The nd->path reference will also be put. + * + * 1: if we successfully resolved nd->last and found it to be a symlink + * that needs to be followed. "path" will be populated with the path + * to the link, and nd->path will *not* be put. + */ +static int +umount_lookup_last(struct nameidata *nd, struct path *path) +{ + int error = 0; + struct dentry *dentry; + struct dentry *dir = nd->path.dentry; + + if (unlikely(nd->flags & LOOKUP_RCU)) { + WARN_ON_ONCE(1); + error = -ECHILD; + goto error_check; + } + + nd->flags &= ~LOOKUP_PARENT; + + if (unlikely(nd->last_type != LAST_NORM)) { + error = handle_dots(nd, nd->last_type); + if (!error) + dentry = dget(nd->path.dentry); + goto error_check; + } + + mutex_lock(&dir->d_inode->i_mutex); + dentry = d_lookup(dir, &nd->last); + if (!dentry) { + /* + * No cached dentry. Mounted dentries are pinned in the cache, + * so that means that this dentry is probably a symlink or the + * path doesn't actually point to a mounted dentry. + */ + dentry = d_alloc(dir, &nd->last); + if (!dentry) { + error = -ENOMEM; + } else { + dentry = lookup_real(dir->d_inode, dentry, nd->flags); + if (IS_ERR(dentry)) + error = PTR_ERR(dentry); + } + } + mutex_unlock(&dir->d_inode->i_mutex); + +error_check: + if (!error) { + if (!dentry->d_inode) { + error = -ENOENT; + dput(dentry); + } else { + path->dentry = dentry; + path->mnt = mntget(nd->path.mnt); + if (should_follow_link(dentry->d_inode, + nd->flags & LOOKUP_FOLLOW)) + return 1; + follow_mount(path); + } + } + terminate_walk(nd); + return error; +} + +/** + * path_umountat - look up a path to be umounted + * @dfd: directory file descriptor to start walk from + * @name: full pathname to walk + * @flags: lookup flags + * @nd: pathwalk nameidata + * + * Look up the given name, but don't attempt to revalidate the last component. + * Returns 0 and "path" will be valid on success; Retuns error otherwise. + */ +static int +path_umountat(int dfd, const char *name, struct path *path, unsigned int flags) +{ + struct file *base = NULL; + struct nameidata nd; + int err; + + err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base); + if (unlikely(err)) + return err; + + current->total_link_count = 0; + err = link_path_walk(name, &nd); + if (err) + goto out; + + /* If we're in rcuwalk, drop out of it to handle last component */ + if (nd.flags & LOOKUP_RCU) { + err = unlazy_walk(&nd, NULL); + if (err) { + terminate_walk(&nd); + goto out; + } + } + + err = umount_lookup_last(&nd, path); + while (err > 0) { + void *cookie; + struct path link = *path; + err = may_follow_link(&link, &nd); + if (unlikely(err)) + break; + nd.flags |= LOOKUP_PARENT; + err = follow_link(&link, &nd, &cookie); + if (err) + break; + err = umount_lookup_last(&nd, path); + put_link(&nd, &link, cookie); + } +out: + if (base) + fput(base); + + if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT)) + path_put(&nd.root); + + return err; +} + +/** + * user_path_umountat - lookup a path from userland in order to umount it + * @dfd: directory file descriptor + * @name: pathname from userland + * @flags: lookup flags + * @path: pointer to container to hold result + * + * A umount is a special case for path walking. We're not actually interested + * in the inode in this situation, and ESTALE errors can be a problem. We + * simply want track down the dentry and vfsmount attached at the mountpoint + * and avoid revalidating the last component. + * + * Returns 0 and populates "path" on success. + */ +int +user_path_umountat(int dfd, const char __user *name, unsigned int flags, + struct path *path) +{ + struct filename *s = getname(name); + int error; + + if (IS_ERR(s)) + return PTR_ERR(s); + + error = path_umountat(dfd, s->name, path, flags | LOOKUP_RCU); + if (unlikely(error == -ECHILD)) + error = path_umountat(dfd, s->name, path, flags); + if (unlikely(error == -ESTALE)) + error = path_umountat(dfd, s->name, path, flags | LOOKUP_REVAL); + + if (likely(!error)) + audit_inode(s, path->dentry, 0); + + putname(s); + return error; +} + /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. @@ -3327,7 +3547,7 @@ void dentry_unhash(struct dentry *dentry) { shrink_dcache_parent(dentry); spin_lock(&dentry->d_lock); - if (dentry->d_count == 1) + if (dentry->d_lockref.count == 1) __d_drop(dentry); spin_unlock(&dentry->d_lock); } diff --git a/fs/namespace.c b/fs/namespace.c index 7b1ca9ba0b0a..ad8ea9bc2518 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1318,7 +1318,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); + retval = user_path_umountat(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; mnt = real_mount(path.mnt); @@ -1429,7 +1429,7 @@ struct vfsmount *collect_mounts(struct path *path) CL_COPY_ALL | CL_PRIVATE); namespace_unlock(); if (IS_ERR(tree)) - return NULL; + return ERR_CAST(tree); return &tree->mnt; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index af6e806044d7..941246f2b43d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -463,7 +463,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); - nfs_setsecurity(inode, fattr, label); dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), @@ -963,9 +962,15 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); - + int ret; + if (mapping->nrpages != 0) { - int ret = invalidate_inode_pages2(mapping); + if (S_ISREG(inode->i_mode)) { + ret = nfs_sync_mapping(mapping); + if (ret < 0) + return ret; + } + ret = invalidate_inode_pages2(mapping); if (ret < 0) return ret; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cf11799297c4..108a774095f7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3071,15 +3071,13 @@ struct rpc_clnt * nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { + struct rpc_clnt *client = NFS_CLIENT(dir); int status; - struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir)); status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL); - if (status < 0) { - rpc_shutdown_client(client); + if (status < 0) return ERR_PTR(status); - } - return client; + return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client; } static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 71fdc0dfa0d2..f6db66d8f647 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2478,6 +2478,10 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, if (server->flags & NFS_MOUNT_NOAC) sb_mntdata.mntflags |= MS_SYNCHRONOUS; + if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL) + if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS) + sb_mntdata.mntflags |= MS_SYNCHRONOUS; + /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata); if (IS_ERR(s)) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0d4c410e4589..419572f33b72 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1524,7 +1524,7 @@ static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ - 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\ + 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\ 2 + /*eir_server_owner.so_minor_id */\ /* eir_server_owner.so_major_id<> */\ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 280acef6f0dc..43f42290e5df 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1264,6 +1264,8 @@ static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp) struct svc_cred *cr = &rqstp->rq_cred; u32 service; + if (!cr->cr_gss_mech) + return false; service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor); return service == RPC_GSS_SVC_INTEGRITY || service == RPC_GSS_SVC_PRIVACY; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0c0f3ea90de5..d9454fe5653f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1816,10 +1816,7 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, static __be32 nfsd4_encode_path(const struct path *root, const struct path *path, __be32 **pp, int *buflen) { - struct path cur = { - .mnt = path->mnt, - .dentry = path->dentry, - }; + struct path cur = *path; __be32 *p = *pp; struct dentry **components = NULL; unsigned int ncomponents = 0; @@ -1859,14 +1856,19 @@ static __be32 nfsd4_encode_path(const struct path *root, while (ncomponents) { struct dentry *dentry = components[ncomponents - 1]; - unsigned int len = dentry->d_name.len; + unsigned int len; + spin_lock(&dentry->d_lock); + len = dentry->d_name.len; *buflen -= 4 + (XDR_QUADLEN(len) << 2); - if (*buflen < 0) + if (*buflen < 0) { + spin_unlock(&dentry->d_lock); goto out_free; + } WRITE32(len); WRITEMEM(dentry->d_name.name, len); dprintk("/%s", dentry->d_name.name); + spin_unlock(&dentry->d_lock); dput(dentry); ncomponents--; } @@ -3360,7 +3362,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, 8 /* eir_clientid */ + 4 /* eir_sequenceid */ + 4 /* eir_flags */ + - 4 /* spr_how (SP4_NONE) */ + + 4 /* spr_how */ + + 8 /* spo_must_enforce, spo_must_allow */ + 8 /* so_minor_id */ + 4 /* so_major_id.len */ + (XDR_QUADLEN(major_id_sz) * 4) + @@ -3372,8 +3375,6 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, WRITE32(exid->seqid); WRITE32(exid->flags); - /* state_protect4_r. Currently only support SP4_NONE */ - BUG_ON(exid->spa_how != SP4_NONE); WRITE32(exid->spa_how); switch (exid->spa_how) { case SP4_NONE: diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc9a913784ab..2d8be51f90dc 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -345,8 +345,7 @@ static void nilfs_end_bio_write(struct bio *bio, int err) if (err == -EOPNOTSUPP) { set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - bio_put(bio); - /* to be detected by submit_seg_bio() */ + /* to be detected by nilfs_segbuf_submit_bio() */ } if (!uptodate) @@ -377,12 +376,12 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_private = segbuf; bio_get(bio); submit_bio(mode, bio); + segbuf->sb_nbio++; if (bio_flagged(bio, BIO_EOPNOTSUPP)) { bio_put(bio); err = -EOPNOTSUPP; goto failed; } - segbuf->sb_nbio++; bio_put(bio); wi->bio = NULL; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index af3ba0478cdf..7ac2a122ca1d 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -994,23 +994,16 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, return ret; } -static int nilfs_tree_was_touched(struct dentry *root_dentry) -{ - return d_count(root_dentry) > 1; -} - /** - * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint + * nilfs_tree_is_busy() - try to shrink dentries of a checkpoint * @root_dentry: root dentry of the tree to be shrunk * * This function returns true if the tree was in-use. */ -static int nilfs_try_to_shrink_tree(struct dentry *root_dentry) +static bool nilfs_tree_is_busy(struct dentry *root_dentry) { - if (have_submounts(root_dentry)) - return true; shrink_dcache_parent(root_dentry); - return nilfs_tree_was_touched(root_dentry); + return d_count(root_dentry) > 1; } int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) @@ -1034,8 +1027,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) if (inode) { dentry = d_find_alias(inode); if (dentry) { - if (nilfs_tree_was_touched(dentry)) - ret = nilfs_try_to_shrink_tree(dentry); + ret = nilfs_tree_is_busy(dentry); dput(dentry); } iput(inode); @@ -1331,11 +1323,8 @@ nilfs_mount(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; } else if (!sd.cno) { - int busy = false; - - if (nilfs_tree_was_touched(s->s_root)) { - busy = nilfs_try_to_shrink_tree(s->s_root); - if (busy && (flags ^ s->s_flags) & MS_RDONLY) { + if (nilfs_tree_is_busy(s->s_root)) { + if ((flags ^ s->s_flags) & MS_RDONLY) { printk(KERN_ERR "NILFS: the device already " "has a %s mount.\n", (s->s_flags & MS_RDONLY) ? @@ -1343,8 +1332,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, err = -EBUSY; goto failed_super; } - } - if (!busy) { + } else { /* * Try remount to setup mount states if the current * tree is not mounted and only snapshots use this sb. diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 79736a28d84f..94417a85ce6e 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -565,9 +565,7 @@ bail: static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes, - void *private, - int ret, - bool is_async) + void *private) { struct inode *inode = file_inode(iocb->ki_filp); int level; @@ -592,10 +590,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, level = ocfs2_iocb_rw_locked_level(iocb); ocfs2_rw_unlock(inode, level); - - inode_dio_done(inode); - if (is_async) - aio_complete(iocb, ret, 0); } /* @@ -1757,7 +1751,7 @@ try_again: goto out; } else if (ret == 1) { clusters_need = wc->w_clen; - ret = ocfs2_refcount_cow(inode, filp, di_bh, + ret = ocfs2_refcount_cow(inode, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index eb760d8acd50..30544ce8e9f7 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2153,11 +2153,9 @@ int ocfs2_empty_dir(struct inode *inode) { int ret; struct ocfs2_empty_dir_priv priv = { - .ctx.actor = ocfs2_empty_dir_filldir + .ctx.actor = ocfs2_empty_dir_filldir, }; - memset(&priv, 0, sizeof(priv)); - if (ocfs2_dir_indexed(inode)) { ret = ocfs2_empty_dir_dx(inode, &priv); if (ret) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 41000f223ca4..3261d71319ee 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -370,7 +370,7 @@ static int ocfs2_cow_file_pos(struct inode *inode, if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) goto out; - return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); + return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); out: return status; @@ -899,7 +899,7 @@ static int ocfs2_zero_extend_get_range(struct inode *inode, zero_clusters = last_cpos - zero_cpos; if (needs_cow) { - rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, + rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, UINT_MAX); if (rc) { mlog_errno(rc); @@ -2078,7 +2078,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode, *meta_level = 1; - ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); + ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); if (ret) mlog_errno(ret); out: diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 96f9ac237e86..0a992737dcaf 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -537,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + - ocfs2_quota_trans_credits(sb) + bits_wanted; + ocfs2_quota_trans_credits(sb); } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index f1fc172175b6..452068b45749 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -69,7 +69,7 @@ static int __ocfs2_move_extent(handle_t *handle, u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); - ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, + ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos, p_cpos, new_p_cpos, len); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9f6b96a09615..a70d604593b6 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -49,7 +49,6 @@ struct ocfs2_cow_context { struct inode *inode; - struct file *file; u32 cow_start; u32 cow_len; struct ocfs2_extent_tree data_et; @@ -66,7 +65,7 @@ struct ocfs2_cow_context { u32 *num_clusters, unsigned int *extent_flags); int (*cow_duplicate_clusters)(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); }; @@ -2922,14 +2921,12 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) } int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len) { int ret = 0, partial; - struct inode *inode = file_inode(file); - struct ocfs2_caching_info *ci = INODE_CACHE(inode); - struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct super_block *sb = inode->i_sb; u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; @@ -2978,13 +2975,6 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, - &file->f_ra, file, - page, page_index, - readahead_pages); - } - if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); if (ret) { @@ -3004,7 +2994,8 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } } - ocfs2_map_and_dirty_page(inode, handle, from, to, + ocfs2_map_and_dirty_page(inode, + handle, from, to, page, 0, &new_block); mark_page_accessed(page); unlock: @@ -3020,12 +3011,11 @@ unlock: } int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len) { int ret = 0; - struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct ocfs2_caching_info *ci = INODE_CACHE(inode); int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); @@ -3150,7 +3140,7 @@ static int ocfs2_replace_clusters(handle_t *handle, /*If the old clusters is unwritten, no need to duplicate. */ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { - ret = context->cow_duplicate_clusters(handle, context->file, + ret = context->cow_duplicate_clusters(handle, context->inode, cpos, old, new, len); if (ret) { mlog_errno(ret); @@ -3428,35 +3418,12 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) return ret; } -static void ocfs2_readahead_for_cow(struct inode *inode, - struct file *file, - u32 start, u32 len) -{ - struct address_space *mapping; - pgoff_t index; - unsigned long num_pages; - int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; - - if (!file) - return; - - mapping = file->f_mapping; - num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT; - if (!num_pages) - num_pages = 1; - - index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT; - page_cache_sync_readahead(mapping, &file->f_ra, file, - index, num_pages); -} - /* * Starting at cpos, try to CoW write_len clusters. Don't CoW * past max_cpos. This will stop when it runs into a hole or an * unrefcounted extent. */ static int ocfs2_refcount_cow_hunk(struct inode *inode, - struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3485,8 +3452,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, BUG_ON(cow_len == 0); - ocfs2_readahead_for_cow(inode, file, cow_start, cow_len); - context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); if (!context) { ret = -ENOMEM; @@ -3508,7 +3473,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, context->ref_root_bh = ref_root_bh; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; context->get_clusters = ocfs2_di_get_clusters; - context->file = file; ocfs2_init_dinode_extent_tree(&context->data_et, INODE_CACHE(inode), di_bh); @@ -3537,7 +3501,6 @@ out: * clusters between cpos and cpos+write_len are safe to modify. */ int ocfs2_refcount_cow(struct inode *inode, - struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3557,7 +3520,7 @@ int ocfs2_refcount_cow(struct inode *inode, num_clusters = write_len; if (ext_flags & OCFS2_EXT_REFCOUNTED) { - ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, + ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, num_clusters, max_cpos); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 7754608c83a4..6422bbcdb525 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -53,7 +53,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, int *credits, int *ref_blocks); int ocfs2_refcount_cow(struct inode *inode, - struct file *filep, struct buffer_head *di_bh, + struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos); typedef int (ocfs2_post_refcount_func)(struct inode *inode, @@ -85,11 +85,11 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, u32 cpos, u32 write_len, struct ocfs2_post_refcount *post); int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); int ocfs2_cow_sync_writeback(struct super_block *sb, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 854d80955bf8..121da2dc3be8 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1022,7 +1022,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode = NULL; struct ocfs2_super *osb = NULL; struct buffer_head *bh = NULL; - char nodestr[8]; + char nodestr[12]; struct ocfs2_blockcheck_stats stats; trace_ocfs2_fill_super(sb, data, silent); diff --git a/fs/open.c b/fs/open.c index d53e29895082..8070825b285b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -485,14 +485,13 @@ out_unlock: SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { - struct file * file; + struct fd f = fdget(fd); int err = -EBADF; - file = fget(fd); - if (file) { - audit_inode(NULL, file->f_path.dentry, 0); - err = chmod_common(&file->f_path, mode); - fput(file); + if (f.file) { + audit_inode(NULL, f.file->f_path.dentry, 0); + err = chmod_common(&f.file->f_path, mode); + fdput(f); } return err; } @@ -823,7 +822,7 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o int lookup_flags = 0; int acc_mode; - if (flags & O_CREAT) + if (flags & (O_CREAT | __O_TMPFILE)) op->mode = (mode & S_IALLUGO) | S_IFREG; else op->mode = 0; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 75f2890abbd8..0ff80f9b930f 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -230,8 +230,6 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!dir_emit_dots(file, ctx)) goto out; - if (!dir_emit_dots(file, ctx)) - goto out; files = get_files_struct(p); if (!files) goto out; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 94441a407337..737e15615b04 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -271,7 +271,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, de = next; } while (de); spin_unlock(&proc_subdir_lock); - return 0; + return 1; } int proc_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/root.c b/fs/proc/root.c index 229e366598da..e0a790da726d 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -205,7 +205,9 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr static int proc_root_readdir(struct file *file, struct dir_context *ctx) { if (ctx->pos < FIRST_PROCESS_ENTRY) { - proc_readdir(file, ctx); + int error = proc_readdir(file, ctx); + if (unlikely(error <= 0)) + return error; ctx->pos = FIRST_PROCESS_ENTRY; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dbf61f6174f0..107d026f5d6e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -730,8 +730,16 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * of how soft-dirty works. */ pte_t ptent = *pte; - ptent = pte_wrprotect(ptent); - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + + if (pte_present(ptent)) { + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_clear_soft_dirty(ptent); + } else if (pte_file(ptent)) { + ptent = pte_file_clear_soft_dirty(ptent); + } + set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -752,14 +760,15 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; - if (!pte_present(ptent)) - continue; if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); continue; } + if (!pte_present(ptent)) + continue; + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -859,7 +868,7 @@ typedef struct { } pagemap_entry_t; struct pagemapread { - int pos, len; + int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool v2; }; @@ -867,7 +876,7 @@ struct pagemapread { #define PAGEMAP_WALK_SIZE (PMD_SIZE) #define PAGEMAP_WALK_MASK (PMD_MASK) -#define PM_ENTRY_BYTES sizeof(u64) +#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) #define PM_STATUS_BITS 3 #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) @@ -930,8 +939,10 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); } else if (is_swap_pte(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; + entry = pte_to_swp_entry(pte); frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags = PM_SWAP; @@ -1116,8 +1127,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out_task; pm.v2 = soft_dirty_cleared; - pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); + pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); + pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); ret = -ENOMEM; if (!pm.buffer) goto out_task; diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index ca71db69da07..983d9510becc 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -1,6 +1,8 @@ config PSTORE bool "Persistent store support" default n + select ZLIB_DEFLATE + select ZLIB_INFLATE help This option enables generic access to platform level persistent storage via "pstore" filesystem that can diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 71bf5f4ae84c..12823845d324 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -275,8 +275,8 @@ int pstore_is_mounted(void) * Set the mtime & ctime to the date that this record was originally stored. */ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, - char *data, size_t size, struct timespec time, - struct pstore_info *psi) + char *data, bool compressed, size_t size, + struct timespec time, struct pstore_info *psi) { struct dentry *root = pstore_sb->s_root; struct dentry *dentry; @@ -315,7 +315,8 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, switch (type) { case PSTORE_TYPE_DMESG: - sprintf(name, "dmesg-%s-%lld", psname, id); + sprintf(name, "dmesg-%s-%lld%s", psname, id, + compressed ? ".enc.z" : ""); break; case PSTORE_TYPE_CONSOLE: sprintf(name, "console-%s", psname); @@ -345,9 +346,8 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, mutex_lock(&root->d_inode->i_mutex); - rc = -ENOSPC; dentry = d_alloc_name(root, name); - if (IS_ERR(dentry)) + if (!dentry) goto fail_lockedalloc; memcpy(private->data, data, size); diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 937d820f273c..3b3d305277c4 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -50,8 +50,9 @@ extern struct pstore_info *psinfo; extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(int); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, - int count, char *data, size_t size, - struct timespec time, struct pstore_info *psi); + int count, char *data, bool compressed, + size_t size, struct timespec time, + struct pstore_info *psi); extern int pstore_is_mounted(void); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 422962ae9fc2..4ffb7ab5e397 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -26,6 +26,7 @@ #include <linux/console.h> #include <linux/module.h> #include <linux/pstore.h> +#include <linux/zlib.h> #include <linux/string.h> #include <linux/timer.h> #include <linux/slab.h> @@ -65,6 +66,15 @@ struct pstore_info *psinfo; static char *backend; +/* Compression parameters */ +#define COMPR_LEVEL 6 +#define WINDOW_BITS 12 +#define MEM_LEVEL 4 +static struct z_stream_s stream; + +static char *big_oops_buf; +static size_t big_oops_buf_sz; + /* How much of the console log to snapshot */ static unsigned long kmsg_bytes = 10240; @@ -117,6 +127,121 @@ bool pstore_cannot_block_path(enum kmsg_dump_reason reason) } EXPORT_SYMBOL_GPL(pstore_cannot_block_path); +/* Derived from logfs_compress() */ +static int pstore_compress(const void *in, void *out, size_t inlen, + size_t outlen) +{ + int err, ret; + + ret = -EIO; + err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS, + MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + return ret; +} + +/* Derived from logfs_uncompress */ +static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + err = zlib_inflateInit(&stream); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_inflateEnd(&stream); + if (err != Z_OK) + goto error; + + ret = stream.total_out; +error: + return ret; +} + +static void allocate_buf_for_compression(void) +{ + size_t size; + + big_oops_buf_sz = (psinfo->bufsize * 100) / 45; + big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); + if (big_oops_buf) { + size = max(zlib_deflate_workspacesize(WINDOW_BITS, MEM_LEVEL), + zlib_inflate_workspacesize()); + stream.workspace = kmalloc(size, GFP_KERNEL); + if (!stream.workspace) { + pr_err("pstore: No memory for compression workspace; " + "skipping compression\n"); + kfree(big_oops_buf); + big_oops_buf = NULL; + } + } else { + pr_err("No memory for uncompressed data; " + "skipping compression\n"); + stream.workspace = NULL; + } + +} + +/* + * Called when compression fails, since the printk buffer + * would be fetched for compression calling it again when + * compression fails would have moved the iterator of + * printk buffer which results in fetching old contents. + * Copy the recent messages from big_oops_buf to psinfo->buf + */ +static size_t copy_kmsg_to_buffer(int hsize, size_t len) +{ + size_t total_len; + size_t diff; + + total_len = hsize + len; + + if (total_len > psinfo->bufsize) { + diff = total_len - psinfo->bufsize + hsize; + memcpy(psinfo->buf, big_oops_buf, hsize); + memcpy(psinfo->buf + hsize, big_oops_buf + diff, + psinfo->bufsize - hsize); + total_len = psinfo->bufsize; + } else + memcpy(psinfo->buf, big_oops_buf, total_len); + + return total_len; +} + /* * callback from kmsg_dump. (s2,l2) has the most recently * written bytes, older bytes are in (s1,l1). Save as much @@ -148,22 +273,56 @@ static void pstore_dump(struct kmsg_dumper *dumper, char *dst; unsigned long size; int hsize; + int zipped_len = -1; size_t len; + bool compressed; + size_t total_len; - dst = psinfo->buf; - hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part); - size = psinfo->bufsize - hsize; - dst += hsize; + if (big_oops_buf) { + dst = big_oops_buf; + hsize = sprintf(dst, "%s#%d Part%d\n", why, + oopscount, part); + size = big_oops_buf_sz - hsize; - if (!kmsg_dump_get_buffer(dumper, true, dst, size, &len)) - break; + if (!kmsg_dump_get_buffer(dumper, true, dst + hsize, + size, &len)) + break; + + zipped_len = pstore_compress(dst, psinfo->buf, + hsize + len, psinfo->bufsize); + + if (zipped_len > 0) { + compressed = true; + total_len = zipped_len; + } else { + pr_err("pstore: compression failed for Part %d" + " returned %d\n", part, zipped_len); + pr_err("pstore: Capture uncompressed" + " oops/panic report of Part %d\n", part); + compressed = false; + total_len = copy_kmsg_to_buffer(hsize, len); + } + } else { + dst = psinfo->buf; + hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, + part); + size = psinfo->bufsize - hsize; + dst += hsize; + + if (!kmsg_dump_get_buffer(dumper, true, dst, + size, &len)) + break; + + compressed = false; + total_len = hsize + len; + } ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - oopscount, hsize, hsize + len, psinfo); + oopscount, compressed, total_len, psinfo); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; - total += hsize + len; + total += total_len; part++; } if (pstore_cannot_block_path(reason)) { @@ -221,10 +380,10 @@ static void pstore_register_console(void) {} static int pstore_write_compat(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, int count, - size_t hsize, size_t size, + bool compressed, size_t size, struct pstore_info *psi) { - return psi->write_buf(type, reason, id, part, psinfo->buf, hsize, + return psi->write_buf(type, reason, id, part, psinfo->buf, compressed, size, psi); } @@ -261,6 +420,8 @@ int pstore_register(struct pstore_info *psi) return -EINVAL; } + allocate_buf_for_compression(); + if (pstore_is_mounted()) pstore_get_records(0); @@ -297,6 +458,8 @@ void pstore_get_records(int quiet) enum pstore_type_id type; struct timespec time; int failed = 0, rc; + bool compressed; + int unzipped_len = -1; if (!psi) return; @@ -305,11 +468,32 @@ void pstore_get_records(int quiet) if (psi->open && psi->open(psi)) goto out; - while ((size = psi->read(&id, &type, &count, &time, &buf, psi)) > 0) { + while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed, + psi)) > 0) { + if (compressed && (type == PSTORE_TYPE_DMESG)) { + if (big_oops_buf) + unzipped_len = pstore_decompress(buf, + big_oops_buf, size, + big_oops_buf_sz); + + if (unzipped_len > 0) { + buf = big_oops_buf; + size = unzipped_len; + compressed = false; + } else { + pr_err("pstore: decompression failed;" + "returned %d\n", unzipped_len); + compressed = true; + } + } rc = pstore_mkfile(type, psi->name, id, count, buf, - (size_t)size, time, psi); - kfree(buf); - buf = NULL; + compressed, (size_t)size, time, psi); + if (unzipped_len < 0) { + /* Free buffer other than big oops */ + kfree(buf); + buf = NULL; + } else + unzipped_len = -1; if (rc && (rc != -EEXIST || !quiet)) failed++; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index a6119f9469e2..fa8cef2cca3a 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -131,9 +131,31 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, return prz; } +static void ramoops_read_kmsg_hdr(char *buffer, struct timespec *time, + bool *compressed) +{ + char data_type; + + if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n", + &time->tv_sec, &time->tv_nsec, &data_type) == 3) { + if (data_type == 'C') + *compressed = true; + else + *compressed = false; + } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", + &time->tv_sec, &time->tv_nsec) == 2) { + *compressed = false; + } else { + time->tv_sec = 0; + time->tv_nsec = 0; + *compressed = false; + } +} + static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, int *count, struct timespec *time, - char **buf, struct pstore_info *psi) + char **buf, bool *compressed, + struct pstore_info *psi) { ssize_t size; ssize_t ecc_notice_size; @@ -152,10 +174,6 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (!prz) return 0; - /* TODO(kees): Bogus time for the moment. */ - time->tv_sec = 0; - time->tv_nsec = 0; - size = persistent_ram_old_size(prz); /* ECC correction notice */ @@ -166,12 +184,14 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, return -ENOMEM; memcpy(*buf, persistent_ram_old(prz), size); + ramoops_read_kmsg_hdr(*buf, time, compressed); persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1); return size + ecc_notice_size; } -static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) +static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, + bool compressed) { char *hdr; struct timespec timestamp; @@ -182,8 +202,9 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) timestamp.tv_sec = 0; timestamp.tv_nsec = 0; } - hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", - (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000)); + hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n", + (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000), + compressed ? 'C' : 'D'); WARN_ON_ONCE(!hdr); len = hdr ? strlen(hdr) : 0; persistent_ram_write(prz, hdr, len); @@ -196,7 +217,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, const char *buf, - size_t hsize, size_t size, + bool compressed, size_t size, struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; @@ -242,7 +263,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, prz = cxt->przs[cxt->dump_write_cnt]; - hlen = ramoops_write_kmsg_hdr(prz); + hlen = ramoops_write_kmsg_hdr(prz, compressed); if (size + hlen > prz->buffer_size) size = prz->buffer_size - hlen; persistent_ram_write(prz, buf, size); @@ -400,11 +421,11 @@ static int ramoops_probe(struct platform_device *pdev) goto fail_out; } - if (!is_power_of_2(pdata->record_size)) + if (pdata->record_size && !is_power_of_2(pdata->record_size)) pdata->record_size = rounddown_pow_of_two(pdata->record_size); - if (!is_power_of_2(pdata->console_size)) + if (pdata->console_size && !is_power_of_2(pdata->console_size)) pdata->console_size = rounddown_pow_of_two(pdata->console_size); - if (!is_power_of_2(pdata->ftrace_size)) + if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); cxt->dump_read_cnt = 0; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index fbad622841f9..9a702e193538 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1094,6 +1094,14 @@ static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number) dquot->dq_dqb.dqb_rsvspace -= number; } +static void dquot_reclaim_reserved_space(struct dquot *dquot, qsize_t number) +{ + if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number)) + number = dquot->dq_dqb.dqb_curspace; + dquot->dq_dqb.dqb_rsvspace += number; + dquot->dq_dqb.dqb_curspace -= number; +} + static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { @@ -1528,6 +1536,15 @@ void inode_claim_rsv_space(struct inode *inode, qsize_t number) } EXPORT_SYMBOL(inode_claim_rsv_space); +void inode_reclaim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + __inode_sub_bytes(inode, number); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(inode_reclaim_rsv_space); + void inode_sub_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); @@ -1702,6 +1719,35 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) EXPORT_SYMBOL(dquot_claim_space_nodirty); /* + * Convert allocated space back to in-memory reserved quotas + */ +void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) +{ + int cnt; + + if (!dquot_active(inode)) { + inode_reclaim_rsv_space(inode, number); + return; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&dq_data_lock); + /* Claim reserved quotas to allocated quotas */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt]) + dquot_reclaim_reserved_space(inode->i_dquot[cnt], + number); + } + /* Update inode bytes */ + inode_reclaim_rsv_space(inode, number); + spin_unlock(&dq_data_lock); + mark_all_dquot_dirty(inode->i_dquot); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + return; +} +EXPORT_SYMBOL(dquot_reclaim_space_nodirty); + +/* * This operation can block, but only after everything is updated */ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 33532f79b4f7..a958444a75fc 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -19,12 +19,13 @@ /* * LOCKING: * - * We rely on new Alexander Viro's super-block locking. + * These guys are evicted from procfs as the very first step in ->kill_sb(). * */ -static int show_version(struct seq_file *m, struct super_block *sb) +static int show_version(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; char *format; if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) { @@ -66,8 +67,9 @@ static int show_version(struct seq_file *m, struct super_block *sb) #define DJP( x ) le32_to_cpu( jp -> x ) #define JF( x ) ( r -> s_journal -> x ) -static int show_super(struct seq_file *m, struct super_block *sb) +static int show_super(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "state: \t%s\n" @@ -128,8 +130,9 @@ static int show_super(struct seq_file *m, struct super_block *sb) return 0; } -static int show_per_level(struct seq_file *m, struct super_block *sb) +static int show_per_level(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); int level; @@ -186,8 +189,9 @@ static int show_per_level(struct seq_file *m, struct super_block *sb) return 0; } -static int show_bitmap(struct seq_file *m, struct super_block *sb) +static int show_bitmap(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "free_block: %lu\n" @@ -218,8 +222,9 @@ static int show_bitmap(struct seq_file *m, struct super_block *sb) return 0; } -static int show_on_disk_super(struct seq_file *m, struct super_block *sb) +static int show_on_disk_super(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); struct reiserfs_super_block *rs = sb_info->s_rs; int hash_code = DFL(s_hash_function_code); @@ -261,8 +266,9 @@ static int show_on_disk_super(struct seq_file *m, struct super_block *sb) return 0; } -static int show_oidmap(struct seq_file *m, struct super_block *sb) +static int show_oidmap(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); struct reiserfs_super_block *rs = sb_info->s_rs; unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize); @@ -291,8 +297,9 @@ static int show_oidmap(struct seq_file *m, struct super_block *sb) return 0; } -static int show_journal(struct seq_file *m, struct super_block *sb) +static int show_journal(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); struct reiserfs_super_block *rs = r->s_rs; struct journal_params *jp = &rs->s_v1.s_journal; @@ -383,92 +390,24 @@ static int show_journal(struct seq_file *m, struct super_block *sb) return 0; } -/* iterator */ -static int test_sb(struct super_block *sb, void *data) -{ - return data == sb; -} - -static int set_sb(struct super_block *sb, void *data) -{ - return -ENOENT; -} - -struct reiserfs_seq_private { - struct super_block *sb; - int (*show) (struct seq_file *, struct super_block *); -}; - -static void *r_start(struct seq_file *m, loff_t * pos) -{ - struct reiserfs_seq_private *priv = m->private; - loff_t l = *pos; - - if (l) - return NULL; - - if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, priv->sb))) - return NULL; - - up_write(&priv->sb->s_umount); - return priv->sb; -} - -static void *r_next(struct seq_file *m, void *v, loff_t * pos) -{ - ++*pos; - if (v) - deactivate_super(v); - return NULL; -} - -static void r_stop(struct seq_file *m, void *v) -{ - if (v) - deactivate_super(v); -} - -static int r_show(struct seq_file *m, void *v) -{ - struct reiserfs_seq_private *priv = m->private; - return priv->show(m, v); -} - -static const struct seq_operations r_ops = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = r_show, -}; - static int r_open(struct inode *inode, struct file *file) { - struct reiserfs_seq_private *priv; - int ret = seq_open_private(file, &r_ops, - sizeof(struct reiserfs_seq_private)); - - if (!ret) { - struct seq_file *m = file->private_data; - priv = m->private; - priv->sb = proc_get_parent_data(inode); - priv->show = PDE_DATA(inode); - } - return ret; + return single_open(file, PDE_DATA(inode), + proc_get_parent_data(inode)); } static const struct file_operations r_file_operations = { .open = r_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, - .owner = THIS_MODULE, + .release = single_release, }; static struct proc_dir_entry *proc_info_root = NULL; static const char proc_info_root_name[] = "fs/reiserfs"; static void add_file(struct super_block *sb, char *name, - int (*func) (struct seq_file *, struct super_block *)) + int (*func) (struct seq_file *, void *)) { proc_create_data(name, 0, REISERFS_SB(sb)->procdir, &r_file_operations, func); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index f8a23c3078f8..e2e202a07b31 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -499,6 +499,7 @@ int remove_save_link(struct inode *inode, int truncate) static void reiserfs_kill_sb(struct super_block *s) { if (REISERFS_SB(s)) { + reiserfs_proc_info_done(s); /* * Force any pending inode evictions to occur now. Any * inodes to be removed that have extended attributes @@ -554,8 +555,6 @@ static void reiserfs_put_super(struct super_block *s) REISERFS_SB(s)->reserved_blocks); } - reiserfs_proc_info_done(s); - reiserfs_write_unlock(s); mutex_destroy(&REISERFS_SB(s)->lock); kfree(s->s_fs_info); diff --git a/fs/stat.c b/fs/stat.c index 04ce1ac20d20..d0ea7ef75e26 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -447,9 +447,8 @@ void inode_add_bytes(struct inode *inode, loff_t bytes) EXPORT_SYMBOL(inode_add_bytes); -void inode_sub_bytes(struct inode *inode, loff_t bytes) +void __inode_sub_bytes(struct inode *inode, loff_t bytes) { - spin_lock(&inode->i_lock); inode->i_blocks -= bytes >> 9; bytes &= 511; if (inode->i_bytes < bytes) { @@ -457,6 +456,14 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes) inode->i_bytes += 512; } inode->i_bytes -= bytes; +} + +EXPORT_SYMBOL(__inode_sub_bytes); + +void inode_sub_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + __inode_sub_bytes(inode, bytes); spin_unlock(&inode->i_lock); } diff --git a/fs/super.c b/fs/super.c index 68307c029228..5536a95186e2 100644 --- a/fs/super.c +++ b/fs/super.c @@ -152,15 +152,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) static const struct super_operations default_op; if (s) { - if (security_sb_alloc(s)) { - /* - * We cannot call security_sb_free() without - * security_sb_alloc() succeeding. So bail out manually - */ - kfree(s); - s = NULL; - goto out; - } + if (security_sb_alloc(s)) + goto out_free_sb; + #ifdef CONFIG_SMP s->s_files = alloc_percpu(struct list_head); if (!s->s_files) @@ -228,6 +222,7 @@ err_out: free_percpu(s->s_files); #endif destroy_sb_writers(s); +out_free_sb: kfree(s); s = NULL; goto out; @@ -414,6 +409,11 @@ void generic_shutdown_super(struct super_block *sb) evict_inodes(sb); + if (sb->s_dio_done_wq) { + destroy_workqueue(sb->s_dio_done_wq); + sb->s_dio_done_wq = NULL; + } + if (sop->put_super) sop->put_super(sb); diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 15c68f9489ae..c590cabd57bb 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -22,8 +22,7 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/mm.h> - -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sysfs.h" @@ -391,7 +390,7 @@ out_unlock: return rc; } -static int open(struct inode * inode, struct file * file) +static int open(struct inode *inode, struct file *file) { struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; @@ -435,7 +434,7 @@ static int open(struct inode * inode, struct file * file) return error; } -static int release(struct inode * inode, struct file * file) +static int release(struct inode *inode, struct file *file) { struct bin_buffer *bb = file->private_data; @@ -481,7 +480,6 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd) * @kobj: object. * @attr: attribute descriptor. */ - int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { @@ -489,19 +487,16 @@ int sysfs_create_bin_file(struct kobject *kobj, return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); } - +EXPORT_SYMBOL_GPL(sysfs_create_bin_file); /** * sysfs_remove_bin_file - remove binary file for object. * @kobj: object. * @attr: attribute descriptor. */ - void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name); } - -EXPORT_SYMBOL_GPL(sysfs_create_bin_file); EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index e068e744dbdd..99ec5b40e977 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -46,7 +46,7 @@ static unsigned int sysfs_name_hash(const void *ns, const char *name) unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); - hash = ( end_name_hash(hash) ^ hash_ptr( (void *)ns, 31 ) ); + hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 1) @@ -258,7 +258,7 @@ static void sysfs_free_ino(unsigned int ino) spin_unlock(&sysfs_ino_lock); } -void release_sysfs_dirent(struct sysfs_dirent * sd) +void release_sysfs_dirent(struct sysfs_dirent *sd) { struct sysfs_dirent *parent_sd; @@ -451,7 +451,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(acxt->parent_sd)? "required": "invalid", + sysfs_ns_type(acxt->parent_sd) ? "required" : "invalid", acxt->parent_sd->s_name, sd->s_name); return -EINVAL; } @@ -619,7 +619,7 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, if (!!sysfs_ns_type(parent_sd) != !!ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(parent_sd)? "required": "invalid", + sysfs_ns_type(parent_sd) ? "required" : "invalid", parent_sd->s_name, name); return NULL; } @@ -674,7 +674,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, enum kobj_ns_type type, const void *ns, const char *name, struct sysfs_dirent **p_sd) { - umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; + umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; int rc; @@ -735,9 +735,9 @@ static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) /** * sysfs_create_dir - create a directory for an object. - * @kobj: object we're creating directory for. + * @kobj: object we're creating directory for. */ -int sysfs_create_dir(struct kobject * kobj) +int sysfs_create_dir(struct kobject *kobj) { enum kobj_ns_type type; struct sysfs_dirent *parent_sd, *sd; @@ -764,8 +764,8 @@ int sysfs_create_dir(struct kobject * kobj) return error; } -static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) +static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) { struct dentry *ret = NULL; struct dentry *parent = dentry->d_parent; @@ -857,7 +857,7 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) * what used to be sysfs_rmdir() below, instead of calling separately. */ -void sysfs_remove_dir(struct kobject * kobj) +void sysfs_remove_dir(struct kobject *kobj) { struct sysfs_dirent *sd = kobj->sd; @@ -896,7 +896,9 @@ int sysfs_rename(struct sysfs_dirent *sd, sd->s_name = new_name; } - /* Move to the appropriate place in the appropriate directories rbtree. */ + /* + * Move to the appropriate place in the appropriate directories rbtree. + */ sysfs_unlink_sibling(sd); sysfs_get(new_parent_sd); sysfs_put(sd->s_parent); @@ -988,20 +990,21 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) { pos = sysfs_dir_pos(ns, parent_sd, ino, pos); - if (pos) do { - struct rb_node *node = rb_next(&pos->s_rb); - if (!node) - pos = NULL; - else - pos = to_sysfs_dirent(node); - } while (pos && pos->s_ns != ns); + if (pos) + do { + struct rb_node *node = rb_next(&pos->s_rb); + if (!node) + pos = NULL; + else + pos = to_sysfs_dirent(node); + } while (pos && pos->s_ns != ns); return pos; } static int sysfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct sysfs_dirent * parent_sd = dentry->d_fsdata; + struct sysfs_dirent *parent_sd = dentry->d_fsdata; struct sysfs_dirent *pos = file->private_data; enum kobj_ns_type type; const void *ns; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index d2bb7ed8fa74..15ef5eb13663 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -20,7 +20,7 @@ #include <linux/list.h> #include <linux/mutex.h> #include <linux/limits.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sysfs.h" @@ -45,8 +45,8 @@ struct sysfs_open_dirent { struct sysfs_buffer { size_t count; loff_t pos; - char * page; - const struct sysfs_ops * ops; + char *page; + const struct sysfs_ops *ops; struct mutex mutex; int needs_read_fill; int event; @@ -59,16 +59,16 @@ struct sysfs_buffer { * @buffer: data buffer for file. * * Allocate @buffer->page, if it hasn't been already, then call the - * kobject's show() method to fill the buffer with this attribute's - * data. + * kobject's show() method to fill the buffer with this attribute's + * data. * This is called only once, on the file's first read unless an error * is returned. */ -static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) +static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer) { struct sysfs_dirent *attr_sd = dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - const struct sysfs_ops * ops = buffer->ops; + const struct sysfs_ops *ops = buffer->ops; int ret = 0; ssize_t count; @@ -106,7 +106,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer } /** - * sysfs_read_file - read an attribute. + * sysfs_read_file - read an attribute. * @file: file pointer. * @buf: buffer to fill. * @count: number of bytes to read. @@ -127,12 +127,12 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer static ssize_t sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct sysfs_buffer * buffer = file->private_data; + struct sysfs_buffer *buffer = file->private_data; ssize_t retval = 0; mutex_lock(&buffer->mutex); if (buffer->needs_read_fill || *ppos == 0) { - retval = fill_read_buffer(file->f_path.dentry,buffer); + retval = fill_read_buffer(file->f_path.dentry, buffer); if (retval) goto out; } @@ -154,9 +154,8 @@ out: * Allocate @buffer->page if it hasn't been already, then * copy the user-supplied buffer into it. */ - -static int -fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t count) +static int fill_write_buffer(struct sysfs_buffer *buffer, + const char __user *buf, size_t count) { int error; @@ -167,7 +166,7 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t if (count >= PAGE_SIZE) count = PAGE_SIZE - 1; - error = copy_from_user(buffer->page,buf,count); + error = copy_from_user(buffer->page, buf, count); buffer->needs_read_fill = 1; /* if buf is assumed to contain a string, terminate it by \0, so e.g. sscanf() can scan the string easily */ @@ -183,16 +182,15 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t * @count: number of bytes * * Get the correct pointers for the kobject and the attribute we're - * dealing with, then call the store() method for the attribute, + * dealing with, then call the store() method for the attribute, * passing the buffer that we acquired in fill_write_buffer(). */ - -static int -flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count) +static int flush_write_buffer(struct dentry *dentry, + struct sysfs_buffer *buffer, size_t count) { struct sysfs_dirent *attr_sd = dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - const struct sysfs_ops * ops = buffer->ops; + const struct sysfs_ops *ops = buffer->ops; int rc; /* need attr_sd for attr and ops, its parent for kobj */ @@ -219,15 +217,14 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t * then push it to the kobject in flush_write_buffer(). * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come - * on the first write. + * on the first write. * Hint: if you're writing a value, first read the file, modify only the - * the value you're changing, then write entire buffer back. + * the value you're changing, then write entire buffer back. */ - -static ssize_t -sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +static ssize_t sysfs_write_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) { - struct sysfs_buffer * buffer = file->private_data; + struct sysfs_buffer *buffer = file->private_data; ssize_t len; mutex_lock(&buffer->mutex); @@ -339,13 +336,14 @@ static int sysfs_open_file(struct inode *inode, struct file *file) if (kobj->ktype && kobj->ktype->sysfs_ops) ops = kobj->ktype->sysfs_ops; else { - WARN(1, KERN_ERR "missing sysfs attribute operations for " - "kobject: %s\n", kobject_name(kobj)); + WARN(1, KERN_ERR + "missing sysfs attribute operations for kobject: %s\n", + kobject_name(kobj)); goto err_out; } /* File needs write support. - * The inode's perms must say it's ok, + * The inode's perms must say it's ok, * and we must have a store method. */ if (file->f_mode & FMODE_WRITE) { @@ -420,7 +418,7 @@ static int sysfs_release(struct inode *inode, struct file *filp) */ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) { - struct sysfs_buffer * buffer = filp->private_data; + struct sysfs_buffer *buffer = filp->private_data; struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; struct sysfs_open_dirent *od = attr_sd->s_attr.open; @@ -518,8 +516,9 @@ static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, ns = ops->namespace(kobj, attr); out: if (err) { - WARN(1, KERN_ERR "missing sysfs namespace attribute operation for " - "kobject: %s\n", kobject_name(kobj)); + WARN(1, KERN_ERR + "missing sysfs namespace attribute operation for kobject: %s\n", + kobject_name(kobj)); } *pns = ns; return err; @@ -566,17 +565,17 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, /** * sysfs_create_file - create an attribute file for an object. - * @kobj: object we're creating for. + * @kobj: object we're creating for. * @attr: attribute descriptor. */ - -int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) +int sysfs_create_file(struct kobject *kobj, const struct attribute *attr) { BUG_ON(!kobj || !kobj->sd || !attr); return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR); } +EXPORT_SYMBOL_GPL(sysfs_create_file); int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr) { @@ -590,6 +589,7 @@ int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr) sysfs_remove_file(kobj, ptr[i]); return err; } +EXPORT_SYMBOL_GPL(sysfs_create_files); /** * sysfs_add_file_to_group - add an attribute file to a pre-existing group. @@ -654,7 +654,6 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, } EXPORT_SYMBOL_GPL(sysfs_chmod_file); - /** * sysfs_remove_file - remove an object attribute. * @kobj: object we're acting for. @@ -662,8 +661,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); * * Hash the attribute name and kill the victim. */ - -void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) +void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) { const void *ns; @@ -672,13 +670,15 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) sysfs_hash_and_remove(kobj->sd, ns, attr->name); } +EXPORT_SYMBOL_GPL(sysfs_remove_file); -void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) +void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) { int i; for (i = 0; ptr[i]; i++) sysfs_remove_file(kobj, ptr[i]); } +EXPORT_SYMBOL_GPL(sysfs_remove_files); /** * sysfs_remove_file_from_group - remove an attribute file from a group. @@ -793,9 +793,3 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), return 0; } EXPORT_SYMBOL_GPL(sysfs_schedule_callback); - - -EXPORT_SYMBOL_GPL(sysfs_create_file); -EXPORT_SYMBOL_GPL(sysfs_remove_file); -EXPORT_SYMBOL_GPL(sysfs_remove_files); -EXPORT_SYMBOL_GPL(sysfs_create_files); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 09a1a25cd145..5f92cd2f61c1 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -3,8 +3,10 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2013 Greg Kroah-Hartman + * Copyright (c) 2013 The Linux Foundation * - * This file is released undert the GPL v2. + * This file is released undert the GPL v2. * */ @@ -19,8 +21,8 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, const struct attribute_group *grp) { - struct attribute *const* attr; - struct bin_attribute *const* bin_attr; + struct attribute *const *attr; + struct bin_attribute *const *bin_attr; if (grp->attrs) for (attr = grp->attrs; *attr; attr++) @@ -33,8 +35,8 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, const struct attribute_group *grp, int update) { - struct attribute *const* attr; - struct bin_attribute *const* bin_attr; + struct attribute *const *attr; + struct bin_attribute *const *bin_attr; int error = 0, i; if (grp->attrs) { @@ -129,6 +131,41 @@ int sysfs_create_group(struct kobject *kobj, { return internal_create_group(kobj, 0, grp); } +EXPORT_SYMBOL_GPL(sysfs_create_group); + +/** + * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups + * @kobj: The kobject to create the group on + * @groups: The attribute groups to create, NULL terminated + * + * This function creates a bunch of attribute groups. If an error occurs when + * creating a group, all previously created groups will be removed, unwinding + * everything back to the original state when this function was called. + * It will explicitly warn and error if any of the attribute files being + * created already exist. + * + * Returns 0 on success or error code from sysfs_create_group on error. + */ +int sysfs_create_groups(struct kobject *kobj, + const struct attribute_group **groups) +{ + int error = 0; + int i; + + if (!groups) + return 0; + + for (i = 0; groups[i]; i++) { + error = sysfs_create_group(kobj, groups[i]); + if (error) { + while (--i >= 0) + sysfs_remove_group(kobj, groups[i]); + break; + } + } + return error; +} +EXPORT_SYMBOL_GPL(sysfs_create_groups); /** * sysfs_update_group - given a directory kobject, update an attribute group @@ -152,11 +189,18 @@ int sysfs_update_group(struct kobject *kobj, { return internal_create_group(kobj, 1, grp); } +EXPORT_SYMBOL_GPL(sysfs_update_group); - - -void sysfs_remove_group(struct kobject * kobj, - const struct attribute_group * grp) +/** + * sysfs_remove_group: remove a group from a kobject + * @kobj: kobject to remove the group from + * @grp: group to remove + * + * This function removes a group of attributes from a kobject. The attributes + * previously have to have been created for this group, otherwise it will fail. + */ +void sysfs_remove_group(struct kobject *kobj, + const struct attribute_group *grp) { struct sysfs_dirent *dir_sd = kobj->sd; struct sysfs_dirent *sd; @@ -164,8 +208,9 @@ void sysfs_remove_group(struct kobject * kobj, if (grp->name) { sd = sysfs_get_dirent(dir_sd, NULL, grp->name); if (!sd) { - WARN(!sd, KERN_WARNING "sysfs group %p not found for " - "kobject '%s'\n", grp, kobject_name(kobj)); + WARN(!sd, KERN_WARNING + "sysfs group %p not found for kobject '%s'\n", + grp, kobject_name(kobj)); return; } } else @@ -177,6 +222,27 @@ void sysfs_remove_group(struct kobject * kobj, sysfs_put(sd); } +EXPORT_SYMBOL_GPL(sysfs_remove_group); + +/** + * sysfs_remove_groups - remove a list of groups + * + * @kobj: The kobject for the groups to be removed from + * @groups: NULL terminated list of groups to be removed + * + * If groups is not NULL, remove the specified groups from the kobject. + */ +void sysfs_remove_groups(struct kobject *kobj, + const struct attribute_group **groups) +{ + int i; + + if (!groups) + return; + for (i = 0; groups[i]; i++) + sysfs_remove_group(kobj, groups[i]); +} +EXPORT_SYMBOL_GPL(sysfs_remove_groups); /** * sysfs_merge_group - merge files into a pre-existing attribute group. @@ -273,7 +339,3 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); - -EXPORT_SYMBOL_GPL(sysfs_create_group); -EXPORT_SYMBOL_GPL(sysfs_update_group); -EXPORT_SYMBOL_GPL(sysfs_remove_group); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 3e2837a633ed..963f910c8034 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -10,7 +10,7 @@ * Please see Documentation/filesystems/sysfs.txt for more information. */ -#undef DEBUG +#undef DEBUG #include <linux/pagemap.h> #include <linux/namei.h> @@ -36,7 +36,7 @@ static struct backing_dev_info sysfs_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static const struct inode_operations sysfs_inode_operations ={ +static const struct inode_operations sysfs_inode_operations = { .permission = sysfs_permission, .setattr = sysfs_setattr, .getattr = sysfs_getattr, @@ -67,7 +67,7 @@ static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) return attrs; } -int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr) +int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr) { struct sysfs_inode_attrs *sd_attrs; struct iattr *iattrs; @@ -128,7 +128,8 @@ out: return error; } -static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len) +static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, + u32 *secdata_len) { struct sysfs_inode_attrs *iattrs; void *old_secdata; @@ -186,13 +187,13 @@ out: return error; } -static inline void set_default_inode_attr(struct inode * inode, umode_t mode) +static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; } -static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) +static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) { inode->i_uid = iattr->ia_uid; inode->i_gid = iattr->ia_gid; @@ -220,7 +221,8 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) set_nlink(inode, sd->s_dir.subdirs + 2); } -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) { struct sysfs_dirent *sd = dentry->d_fsdata; struct inode *inode = dentry->d_inode; @@ -285,7 +287,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) * RETURNS: * Pointer to allocated inode on success, NULL on failure. */ -struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) +struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) { struct inode *inode; @@ -312,7 +314,8 @@ void sysfs_evict_inode(struct inode *inode) sysfs_put(sd); } -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name) +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, + const char *name) { struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index afd83273e6ce..fd7ce7a39f91 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -64,7 +64,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) /* instantiate and link root dentry */ root = d_make_root(inode); if (!root) { - pr_debug("%s: could not get root dentry!\n",__func__); + pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } root->d_fsdata = &sysfs_root; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 8c940df97a52..2dd4507d9edd 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -125,6 +125,7 @@ int sysfs_create_link(struct kobject *kobj, struct kobject *target, { return sysfs_do_create_link(kobj, target, name, 1); } +EXPORT_SYMBOL_GPL(sysfs_create_link); /** * sysfs_create_link_nowarn - create symlink between two objects. @@ -166,8 +167,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, * @kobj: object we're acting for. * @name: name of the symlink to remove. */ - -void sysfs_remove_link(struct kobject * kobj, const char * name) +void sysfs_remove_link(struct kobject *kobj, const char *name) { struct sysfs_dirent *parent_sd = NULL; @@ -178,6 +178,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name) sysfs_hash_and_remove(parent_sd, NULL, name); } +EXPORT_SYMBOL_GPL(sysfs_remove_link); /** * sysfs_rename_link - rename symlink in object's directory. @@ -223,6 +224,7 @@ out: sysfs_put(sd); return result; } +EXPORT_SYMBOL_GPL(sysfs_rename_link); static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, struct sysfs_dirent *target_sd, char *path) @@ -276,7 +278,7 @@ static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, return 0; } -static int sysfs_getlink(struct dentry *dentry, char * path) +static int sysfs_getlink(struct dentry *dentry, char *path) { struct sysfs_dirent *sd = dentry->d_fsdata; struct sysfs_dirent *parent_sd = sd->s_parent; @@ -295,7 +297,7 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); if (page) { - error = sysfs_getlink(dentry, (char *) page); + error = sysfs_getlink(dentry, (char *) page); if (error < 0) free_page((unsigned long)page); } @@ -303,7 +305,8 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) { char *page = nd_get_link(nd); if (!IS_ERR(page)) @@ -319,8 +322,3 @@ const struct inode_operations sysfs_symlink_inode_operations = { .getattr = sysfs_getattr, .permission = sysfs_permission, }; - - -EXPORT_SYMBOL_GPL(sysfs_create_link); -EXPORT_SYMBOL_GPL(sysfs_remove_link); -EXPORT_SYMBOL_GPL(sysfs_rename_link); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d1e4043eb0c3..b6deca3e301d 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -78,7 +78,7 @@ struct sysfs_dirent { }; unsigned short s_flags; - umode_t s_mode; + umode_t s_mode; unsigned int s_ino; struct sysfs_inode_attrs *s_iattr; }; @@ -123,9 +123,9 @@ do { \ key = &attr->skey; \ \ lockdep_init_map(&sd->dep_map, "s_active", key, 0); \ -} while(0) +} while (0) #else -#define sysfs_dirent_init_lockdep(sd) do {} while(0) +#define sysfs_dirent_init_lockdep(sd) do {} while (0) #endif /* @@ -186,8 +186,8 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd); void sysfs_remove_subdir(struct sysfs_dirent *sd); -int sysfs_rename(struct sysfs_dirent *sd, - struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name); +int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, + const void *ns, const char *new_name); static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) { @@ -214,10 +214,12 @@ void sysfs_evict_inode(struct inode *inode); int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); int sysfs_permission(struct inode *inode, int mask); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name); + size_t size, int flags); +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, + const char *name); int sysfs_inode_init(void); /* diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 596ec71da00e..e11d654af786 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -86,14 +86,6 @@ xfs_destroy_ioend( bh->b_end_io(bh, !ioend->io_error); } - if (ioend->io_iocb) { - inode_dio_done(ioend->io_inode); - if (ioend->io_isasync) { - aio_complete(ioend->io_iocb, ioend->io_error ? - ioend->io_error : ioend->io_result, 0); - } - } - mempool_free(ioend, xfs_ioend_pool); } @@ -281,7 +273,6 @@ xfs_alloc_ioend( * all the I/O from calling the completion routine too early. */ atomic_set(&ioend->io_remaining, 1); - ioend->io_isasync = 0; ioend->io_isdirect = 0; ioend->io_error = 0; ioend->io_list = NULL; @@ -291,8 +282,6 @@ xfs_alloc_ioend( ioend->io_buffer_tail = NULL; ioend->io_offset = 0; ioend->io_size = 0; - ioend->io_iocb = NULL; - ioend->io_result = 0; ioend->io_append_trans = NULL; INIT_WORK(&ioend->io_work, xfs_end_io); @@ -1292,8 +1281,10 @@ __xfs_get_blocks( if (create || !ISUNWRITTEN(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); if (create && ISUNWRITTEN(&imap)) { - if (direct) + if (direct) { bh_result->b_private = inode; + set_buffer_defer_completion(bh_result); + } set_buffer_unwritten(bh_result); } } @@ -1390,9 +1381,7 @@ xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, ssize_t size, - void *private, - int ret, - bool is_async) + void *private) { struct xfs_ioend *ioend = iocb->private; @@ -1414,17 +1403,10 @@ xfs_end_io_direct_write( ioend->io_offset = offset; ioend->io_size = size; - ioend->io_iocb = iocb; - ioend->io_result = ret; if (private && size > 0) ioend->io_type = XFS_IO_UNWRITTEN; - if (is_async) { - ioend->io_isasync = 1; - xfs_finish_ioend(ioend); - } else { - xfs_finish_ioend_sync(ioend); - } + xfs_finish_ioend_sync(ioend); } STATIC ssize_t diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index c325abb8d61a..f94dd459dff9 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -45,7 +45,6 @@ typedef struct xfs_ioend { unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ - unsigned int io_isasync : 1; /* needs aio_complete */ unsigned int io_isdirect : 1;/* direct I/O */ struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ @@ -54,8 +53,6 @@ typedef struct xfs_ioend { xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ struct xfs_trans *io_append_trans;/* xact. for size update */ - struct kiocb *io_iocb; - int io_result; } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; |