diff options
Diffstat (limited to 'fs')
142 files changed, 4953 insertions, 3071 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 370b24cee4d8..c055d56ec63d 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -30,6 +30,9 @@ config COMPAT_BINFMT_ELF config ARCH_BINFMT_ELF_RANDOMIZE_PIE bool +config ARCH_BINFMT_ELF_STATE + bool + config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3a6175fe10c0..02b16910f4c9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -386,6 +386,127 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) ELF_PAGESTART(cmds[first_idx].p_vaddr); } +/** + * load_elf_phdrs() - load ELF program headers + * @elf_ex: ELF header of the binary whose program headers should be loaded + * @elf_file: the opened ELF binary file + * + * Loads ELF program headers from the binary file elf_file, which has the ELF + * header pointed to by elf_ex, into a newly allocated array. The caller is + * responsible for freeing the allocated data. Returns an ERR_PTR upon failure. + */ +static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex, + struct file *elf_file) +{ + struct elf_phdr *elf_phdata = NULL; + int retval, size, err = -1; + + /* + * If the size of this structure has changed, then punt, since + * we will be doing the wrong thing. + */ + if (elf_ex->e_phentsize != sizeof(struct elf_phdr)) + goto out; + + /* Sanity check the number of program headers... */ + if (elf_ex->e_phnum < 1 || + elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr)) + goto out; + + /* ...and their total size. */ + size = sizeof(struct elf_phdr) * elf_ex->e_phnum; + if (size > ELF_MIN_ALIGN) + goto out; + + elf_phdata = kmalloc(size, GFP_KERNEL); + if (!elf_phdata) + goto out; + + /* Read in the program headers */ + retval = kernel_read(elf_file, elf_ex->e_phoff, + (char *)elf_phdata, size); + if (retval != size) { + err = (retval < 0) ? retval : -EIO; + goto out; + } + + /* Success! */ + err = 0; +out: + if (err) { + kfree(elf_phdata); + elf_phdata = NULL; + } + return elf_phdata; +} + +#ifndef CONFIG_ARCH_BINFMT_ELF_STATE + +/** + * struct arch_elf_state - arch-specific ELF loading state + * + * This structure is used to preserve architecture specific data during + * the loading of an ELF file, throughout the checking of architecture + * specific ELF headers & through to the point where the ELF load is + * known to be proceeding (ie. SET_PERSONALITY). + * + * This implementation is a dummy for architectures which require no + * specific state. + */ +struct arch_elf_state { +}; + +#define INIT_ARCH_ELF_STATE {} + +/** + * arch_elf_pt_proc() - check a PT_LOPROC..PT_HIPROC ELF program header + * @ehdr: The main ELF header + * @phdr: The program header to check + * @elf: The open ELF file + * @is_interp: True if the phdr is from the interpreter of the ELF being + * loaded, else false. + * @state: Architecture-specific state preserved throughout the process + * of loading the ELF. + * + * Inspects the program header phdr to validate its correctness and/or + * suitability for the system. Called once per ELF program header in the + * range PT_LOPROC to PT_HIPROC, for both the ELF being loaded and its + * interpreter. + * + * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load + * with that return code. + */ +static inline int arch_elf_pt_proc(struct elfhdr *ehdr, + struct elf_phdr *phdr, + struct file *elf, bool is_interp, + struct arch_elf_state *state) +{ + /* Dummy implementation, always proceed */ + return 0; +} + +/** + * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header + * @ehdr: The main ELF header + * @has_interp: True if the ELF has an interpreter, else false. + * @state: Architecture-specific state preserved throughout the process + * of loading the ELF. + * + * Provides a final opportunity for architecture code to reject the loading + * of the ELF & cause an exec syscall to return an error. This is called after + * all program headers to be checked by arch_elf_pt_proc have been. + * + * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load + * with that return code. + */ +static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, + struct arch_elf_state *state) +{ + /* Dummy implementation, always proceed */ + return 0; +} + +#endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */ /* This is much more generalized than the library routine read function, so we keep this separate. Technically the library read function @@ -394,16 +515,15 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, struct file *interpreter, unsigned long *interp_map_addr, - unsigned long no_base) + unsigned long no_base, struct elf_phdr *interp_elf_phdata) { - struct elf_phdr *elf_phdata; struct elf_phdr *eppnt; unsigned long load_addr = 0; int load_addr_set = 0; unsigned long last_bss = 0, elf_bss = 0; unsigned long error = ~0UL; unsigned long total_size; - int retval, i, size; + int i; /* First of all, some simple consistency checks */ if (interp_elf_ex->e_type != ET_EXEC && @@ -414,40 +534,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, if (!interpreter->f_op->mmap) goto out; - /* - * If the size of this structure has changed, then punt, since - * we will be doing the wrong thing. - */ - if (interp_elf_ex->e_phentsize != sizeof(struct elf_phdr)) - goto out; - if (interp_elf_ex->e_phnum < 1 || - interp_elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr)) - goto out; - - /* Now read in all of the header information */ - size = sizeof(struct elf_phdr) * interp_elf_ex->e_phnum; - if (size > ELF_MIN_ALIGN) - goto out; - elf_phdata = kmalloc(size, GFP_KERNEL); - if (!elf_phdata) - goto out; - - retval = kernel_read(interpreter, interp_elf_ex->e_phoff, - (char *)elf_phdata, size); - error = -EIO; - if (retval != size) { - if (retval < 0) - error = retval; - goto out_close; - } - - total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum); + total_size = total_mapping_size(interp_elf_phdata, + interp_elf_ex->e_phnum); if (!total_size) { error = -EINVAL; - goto out_close; + goto out; } - eppnt = elf_phdata; + eppnt = interp_elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; @@ -474,7 +568,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, *interp_map_addr = map_addr; error = map_addr; if (BAD_ADDR(map_addr)) - goto out_close; + goto out; if (!load_addr_set && interp_elf_ex->e_type == ET_DYN) { @@ -493,7 +587,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, eppnt->p_memsz > TASK_SIZE || TASK_SIZE - eppnt->p_memsz < k) { error = -ENOMEM; - goto out_close; + goto out; } /* @@ -523,7 +617,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, */ if (padzero(elf_bss)) { error = -EFAULT; - goto out_close; + goto out; } /* What we have mapped so far */ @@ -532,13 +626,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, /* Map the last of the bss segment */ error = vm_brk(elf_bss, last_bss - elf_bss); if (BAD_ADDR(error)) - goto out_close; + goto out; } error = load_addr; - -out_close: - kfree(elf_phdata); out: return error; } @@ -575,10 +666,9 @@ static int load_elf_binary(struct linux_binprm *bprm) int load_addr_set = 0; char * elf_interpreter = NULL; unsigned long error; - struct elf_phdr *elf_ppnt, *elf_phdata; + struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; unsigned long elf_bss, elf_brk; int retval, i; - unsigned int size; unsigned long elf_entry; unsigned long interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; @@ -589,6 +679,7 @@ static int load_elf_binary(struct linux_binprm *bprm) struct elfhdr elf_ex; struct elfhdr interp_elf_ex; } *loc; + struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; loc = kmalloc(sizeof(*loc), GFP_KERNEL); if (!loc) { @@ -611,26 +702,10 @@ static int load_elf_binary(struct linux_binprm *bprm) if (!bprm->file->f_op->mmap) goto out; - /* Now read in all of the header information */ - if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr)) - goto out; - if (loc->elf_ex.e_phnum < 1 || - loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr)) - goto out; - size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr); - retval = -ENOMEM; - elf_phdata = kmalloc(size, GFP_KERNEL); + elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file); if (!elf_phdata) goto out; - retval = kernel_read(bprm->file, loc->elf_ex.e_phoff, - (char *)elf_phdata, size); - if (retval != size) { - if (retval >= 0) - retval = -EIO; - goto out_free_ph; - } - elf_ppnt = elf_phdata; elf_bss = 0; elf_brk = 0; @@ -699,12 +774,21 @@ static int load_elf_binary(struct linux_binprm *bprm) elf_ppnt = elf_phdata; for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) - if (elf_ppnt->p_type == PT_GNU_STACK) { + switch (elf_ppnt->p_type) { + case PT_GNU_STACK: if (elf_ppnt->p_flags & PF_X) executable_stack = EXSTACK_ENABLE_X; else executable_stack = EXSTACK_DISABLE_X; break; + + case PT_LOPROC ... PT_HIPROC: + retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt, + bprm->file, false, + &arch_state); + if (retval) + goto out_free_dentry; + break; } /* Some simple consistency checks for the interpreter */ @@ -716,8 +800,36 @@ static int load_elf_binary(struct linux_binprm *bprm) /* Verify the interpreter has a valid arch */ if (!elf_check_arch(&loc->interp_elf_ex)) goto out_free_dentry; + + /* Load the interpreter program headers */ + interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex, + interpreter); + if (!interp_elf_phdata) + goto out_free_dentry; + + /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ + elf_ppnt = interp_elf_phdata; + for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++) + switch (elf_ppnt->p_type) { + case PT_LOPROC ... PT_HIPROC: + retval = arch_elf_pt_proc(&loc->interp_elf_ex, + elf_ppnt, interpreter, + true, &arch_state); + if (retval) + goto out_free_dentry; + break; + } } + /* + * Allow arch code to reject the ELF at this point, whilst it's + * still possible to return an error to the code that invoked + * the exec syscall. + */ + retval = arch_check_elf(&loc->elf_ex, !!interpreter, &arch_state); + if (retval) + goto out_free_dentry; + /* Flush all traces of the currently running executable */ retval = flush_old_exec(bprm); if (retval) @@ -725,7 +837,7 @@ static int load_elf_binary(struct linux_binprm *bprm) /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ - SET_PERSONALITY(loc->elf_ex); + SET_PERSONALITY2(loc->elf_ex, &arch_state); if (elf_read_implies_exec(loc->elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@ -890,7 +1002,7 @@ static int load_elf_binary(struct linux_binprm *bprm) elf_entry = load_elf_interp(&loc->interp_elf_ex, interpreter, &interp_map_addr, - load_bias); + load_bias, interp_elf_phdata); if (!IS_ERR((void *)elf_entry)) { /* * load_elf_interp() returns relocation @@ -917,6 +1029,7 @@ static int load_elf_binary(struct linux_binprm *bprm) } } + kfree(interp_elf_phdata); kfree(elf_phdata); set_binfmt(&elf_format); @@ -981,6 +1094,7 @@ out_ret: /* error cleanup */ out_free_dentry: + kfree(interp_elf_phdata); allow_write_access(interpreter); if (interpreter) fput(interpreter); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index cb7f3fe9c9f6..d897ef803b3b 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -94,6 +94,7 @@ #include <linux/mutex.h> #include <linux/genhd.h> #include <linux/blkdev.h> +#include <linux/vmalloc.h> #include "ctree.h" #include "disk-io.h" #include "hash.h" @@ -326,9 +327,6 @@ static int btrfsic_handle_extent_data(struct btrfsic_state *state, static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfsic_block_data_ctx *block_ctx_out, int mirror_num); -static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, - u32 len, struct block_device *bdev, - struct btrfsic_block_data_ctx *block_ctx_out); static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); @@ -1326,24 +1324,25 @@ static int btrfsic_create_link_to_next_block( l = NULL; next_block->generation = BTRFSIC_GENERATION_UNKNOWN; } else { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - printk(KERN_INFO - "Referenced block @%llu (%s/%llu/%d)" - " found in hash table, %c," - " bytenr mismatch (!= stored %llu).\n", - next_bytenr, next_block_ctx->dev->name, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, next_block), - next_block->logical_bytenr); - } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Referenced block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - next_bytenr, next_block_ctx->dev->name, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, next_block)); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, + btrfsic_get_block_type(state, + next_block), + next_block->logical_bytenr); + else + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n", + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, + btrfsic_get_block_type(state, + next_block)); + } next_block->logical_bytenr = next_bytenr; next_block->mirror_num = *mirror_nump; @@ -1529,7 +1528,9 @@ static int btrfsic_handle_extent_data( return -1; } if (!block_was_created) { - if (next_block->logical_bytenr != next_bytenr && + if ((state->print_mask & + BTRFSIC_PRINT_MASK_VERBOSE) && + next_block->logical_bytenr != next_bytenr && !(!next_block->is_metadata && 0 == next_block->logical_bytenr)) { printk(KERN_INFO @@ -1607,25 +1608,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, return ret; } -static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, - u32 len, struct block_device *bdev, - struct btrfsic_block_data_ctx *block_ctx_out) -{ - block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); - block_ctx_out->dev_bytenr = bytenr; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->datav = NULL; - block_ctx_out->pagev = NULL; - block_ctx_out->mem_to_free = NULL; - if (NULL != block_ctx_out->dev) { - return 0; - } else { - printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); - return -ENXIO; - } -} - static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) { if (block_ctx->mem_to_free) { @@ -1901,25 +1883,26 @@ again: dev_state, dev_bytenr); } - if (block->logical_bytenr != bytenr && - !(!block->is_metadata && - block->logical_bytenr == 0)) - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c," - " bytenr mismatch" - " (!= stored %llu).\n", - bytenr, dev_state->name, dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block), - block->logical_bytenr); - else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - bytenr, dev_state->name, dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { + if (block->logical_bytenr != bytenr && + !(!block->is_metadata && + block->logical_bytenr == 0)) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", + bytenr, dev_state->name, + dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, + block), + block->logical_bytenr); + else + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n", + bytenr, dev_state->name, + dev_bytenr, block->mirror_num, + btrfsic_get_block_type(state, + block)); + } block->logical_bytenr = bytenr; } else { if (num_pages * PAGE_CACHE_SIZE < @@ -2002,24 +1985,13 @@ again: } } - if (block->is_superblock) - ret = btrfsic_map_superblock(state, bytenr, - processed_len, - bdev, &block_ctx); - else - ret = btrfsic_map_block(state, bytenr, processed_len, - &block_ctx, 0); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", bytenr); - goto continue_loop; - } - block_ctx.datav = mapped_datav; - /* the following is required in case of writes to mirrors, - * use the same that was used for the lookup */ block_ctx.dev = dev_state; block_ctx.dev_bytenr = dev_bytenr; + block_ctx.start = bytenr; + block_ctx.len = processed_len; + block_ctx.pagev = NULL; + block_ctx.mem_to_free = NULL; + block_ctx.datav = mapped_datav; if (is_metadata || state->include_extent_data) { block->never_written = 0; @@ -2133,10 +2105,6 @@ again: /* this is getting ugly for the * include_extent_data case... */ bytenr = 0; /* unknown */ - block_ctx.start = bytenr; - block_ctx.len = processed_len; - block_ctx.mem_to_free = NULL; - block_ctx.pagev = NULL; } else { processed_len = state->metablock_size; bytenr = btrfs_stack_header_bytenr( @@ -2149,22 +2117,15 @@ again: "Written block @%llu (%s/%llu/?)" " !found in hash table, M.\n", bytenr, dev_state->name, dev_bytenr); - - ret = btrfsic_map_block(state, bytenr, processed_len, - &block_ctx, 0); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", - dev_bytenr); - goto continue_loop; - } } - block_ctx.datav = mapped_datav; - /* the following is required in case of writes to mirrors, - * use the same that was used for the lookup */ + block_ctx.dev = dev_state; block_ctx.dev_bytenr = dev_bytenr; + block_ctx.start = bytenr; + block_ctx.len = processed_len; + block_ctx.pagev = NULL; + block_ctx.mem_to_free = NULL; + block_ctx.datav = mapped_datav; block = btrfsic_block_alloc(); if (NULL == block) { @@ -3130,10 +3091,13 @@ int btrfsic_mount(struct btrfs_root *root, root->sectorsize, PAGE_CACHE_SIZE); return -1; } - state = kzalloc(sizeof(*state), GFP_NOFS); - if (NULL == state) { - printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); - return -1; + state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!state) { + state = vzalloc(sizeof(*state)); + if (!state) { + printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n"); + return -1; + } } if (!btrfsic_is_initialized) { @@ -3277,5 +3241,8 @@ void btrfsic_unmount(struct btrfs_root *root, mutex_unlock(&btrfsic_mutex); - kfree(state); + if (is_vmalloc_addr(state)) + vfree(state); + else + kfree(state); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index dcd9be32ac57..e9df8862012c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -224,16 +224,19 @@ out: * Clear the writeback bits on all of the file * pages for a compressed write */ -static noinline void end_compressed_writeback(struct inode *inode, u64 start, - unsigned long ram_size) +static noinline void end_compressed_writeback(struct inode *inode, + const struct compressed_bio *cb) { - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + unsigned long index = cb->start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT; struct page *pages[16]; unsigned long nr_pages = end_index - index + 1; int i; int ret; + if (cb->errors) + mapping_set_error(inode->i_mapping, -EIO); + while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, min_t(unsigned long, @@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start, continue; } for (i = 0; i < ret; i++) { + if (cb->errors) + SetPageError(pages[i]); end_page_writeback(pages[i]); page_cache_release(pages[i]); } @@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err) tree->ops->writepage_end_io_hook(cb->compressed_pages[0], cb->start, cb->start + cb->len - 1, - NULL, 1); + NULL, + err ? 0 : 1); cb->compressed_pages[0]->mapping = NULL; - end_compressed_writeback(inode, cb->start, cb->len); + end_compressed_writeback(inode, cb); /* note, our inode could be gone now */ /* diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 150822ee0a0b..14a72ed14ef7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2929,7 +2929,7 @@ done: */ if (!p->leave_spinning) btrfs_set_path_blocking(p); - if (ret < 0) + if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fe69edda11fb..e6fbbd74b716 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -607,6 +607,7 @@ struct btrfs_path { unsigned int leave_spinning:1; unsigned int search_commit_root:1; unsigned int need_commit_sem:1; + unsigned int skip_release_on_error:1; }; /* @@ -1170,6 +1171,7 @@ struct btrfs_space_info { struct percpu_counter total_bytes_pinned; struct list_head list; + struct list_head ro_bgs; struct rw_semaphore groups_sem; /* for block groups in our same type */ @@ -1276,6 +1278,8 @@ struct btrfs_block_group_cache { unsigned int ro:1; unsigned int dirty:1; unsigned int iref:1; + unsigned int has_caching_ctl:1; + unsigned int removed:1; int disk_cache_state; @@ -1305,6 +1309,11 @@ struct btrfs_block_group_cache { /* For delayed block group creation or deletion of empty block groups */ struct list_head bg_list; + + /* For read-only block groups */ + struct list_head ro_list; + + atomic_t trimming; }; /* delayed seq elem */ @@ -1402,6 +1411,11 @@ struct btrfs_fs_info { */ u64 last_trans_log_full_commit; unsigned long mount_opt; + /* + * Track requests for actions that need to be done during transaction + * commit (like for some mount options). + */ + unsigned long pending_changes; unsigned long compress_type:4; int commit_interval; /* @@ -1729,6 +1743,12 @@ struct btrfs_fs_info { /* For btrfs to record security options */ struct security_mnt_opts security_opts; + + /* + * Chunks that can't be freed yet (under a trim/discard operation) + * and will be latter freed. Protected by fs_info->chunk_mutex. + */ + struct list_head pinned_chunks; }; struct btrfs_subvolume_writers { @@ -2093,7 +2113,6 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) -#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (8192) @@ -2103,6 +2122,7 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) + #define btrfs_set_and_info(root, opt, fmt, args...) \ { \ if (!btrfs_test_opt(root, opt)) \ @@ -2118,6 +2138,49 @@ struct btrfs_ioctl_defrag_range_args { } /* + * Requests for changes that need to be done during transaction commit. + * + * Internal mount options that are used for special handling of the real + * mount options (eg. cannot be set during remount and have to be set during + * transaction commit) + */ + +#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0) +#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1) +#define BTRFS_PENDING_COMMIT (2) + +#define btrfs_test_pending(info, opt) \ + test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) +#define btrfs_set_pending(info, opt) \ + set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) +#define btrfs_clear_pending(info, opt) \ + clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) + +/* + * Helpers for setting pending mount option changes. + * + * Expects corresponding macros + * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name + */ +#define btrfs_set_pending_and_info(info, opt, fmt, args...) \ +do { \ + if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \ + btrfs_info((info), fmt, ##args); \ + btrfs_set_pending((info), SET_##opt); \ + btrfs_clear_pending((info), CLEAR_##opt); \ + } \ +} while(0) + +#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \ +do { \ + if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \ + btrfs_info((info), fmt, ##args); \ + btrfs_set_pending((info), CLEAR_##opt); \ + btrfs_clear_pending((info), SET_##opt); \ + } \ +} while(0) + +/* * Inode flags */ #define BTRFS_INODE_NODATASUM (1 << 0) @@ -3351,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 type, u64 chunk_objectid, u64 chunk_offset, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start); + struct btrfs_root *root, u64 group_start, + struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3427,8 +3491,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); -int btrfs_start_nocow_write(struct btrfs_root *root); -void btrfs_end_nocow_write(struct btrfs_root *root); +int btrfs_start_write_no_snapshoting(struct btrfs_root *root); +void btrfs_end_write_no_snapshoting(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3686,6 +3750,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, int verify_dir_item(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_dir_item *dir_item); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, + int name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -3857,6 +3925,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); +int btrfs_inode_check_errors(struct inode *inode); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ @@ -3901,6 +3970,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -4097,7 +4167,12 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); -void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); + +static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + btrfs_bio_counter_sub(fs_info, 1); +} /* reada.c */ struct reada_control { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 6f662b34ba0e..ca6a3a3b6b6c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -316,11 +316,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - if (btrfs_fs_incompat(fs_info, RAID56)) { - btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6"); - return -EOPNOTSUPP; - } - switch (args->start.cont_reading_from_srcdev_mode) { case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: @@ -422,9 +417,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root, &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(root->fs_info, ret); - WARN_ON(ret); + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == -EINPROGRESS) { + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; + ret = 0; + } else { + WARN_ON(ret); + } - return 0; + return ret; leave: dev_replace->srcdev = NULL; @@ -542,7 +543,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return 0; + return scrub_ret; } printk_in_rcu(KERN_INFO @@ -571,15 +572,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; - /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info, src_device); - btrfs_kobj_add_device(fs_info, tgt_device); - btrfs_dev_replace_unlock(dev_replace); btrfs_rm_dev_replace_blocked(fs_info); - btrfs_rm_dev_replace_srcdev(fs_info, src_device); + btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device); btrfs_rm_dev_replace_unblocked(fs_info); @@ -594,6 +591,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + /* replace the sysfs entry */ + btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); + /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); if (!IS_ERR(trans)) @@ -920,9 +922,9 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) percpu_counter_inc(&fs_info->bio_counter); } -void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) { - percpu_counter_dec(&fs_info->bio_counter); + percpu_counter_sub(&fs_info->bio_counter, amount); if (waitqueue_active(&fs_info->replace_wait)) wake_up(&fs_info->replace_wait); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index fc8df866e919..1752625fb4dd 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -21,10 +21,6 @@ #include "hash.h" #include "transaction.h" -static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len); - /* * insert a name into a directory, doing overflow properly if there is a hash * collision. data_size indicates how big the item inserted should be. On @@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. */ -static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len) +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len) { struct btrfs_dir_item *dir_item; unsigned long name_ptr; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1bf9f897065d..30965120772b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + INIT_LIST_HEAD(&fs_info->pinned_chunks); + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; @@ -2830,9 +2832,11 @@ retry_root_backup: btrfs_set_opt(fs_info->mount_opt, SSD); } - /* Set the real inode map cache flag */ - if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE)) - btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE); + /* + * Mount does not set all options immediatelly, we can do it now and do + * not have to wait for transaction commit + */ + btrfs_apply_pending_changes(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { @@ -3713,6 +3717,17 @@ void close_ctree(struct btrfs_root *root) btrfs_free_block_rsv(root, root->orphan_block_rsv); root->orphan_block_rsv = NULL; + + lock_chunks(root); + while (!list_empty(&fs_info->pinned_chunks)) { + struct extent_map *em; + + em = list_first_entry(&fs_info->pinned_chunks, + struct extent_map, list); + list_del_init(&em->list); + free_extent_map(em); + } + unlock_chunks(root); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, @@ -3839,12 +3854,12 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, */ if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", - sb->root); + btrfs_super_root(sb)); if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", - sb->chunk_root); + printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", + btrfs_super_chunk_root(sb)); if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", + printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", btrfs_super_log_root(sb)); if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { @@ -4129,6 +4144,25 @@ again: return 0; } +static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&fs_info->trans_lock); + while (!list_empty(&cur_trans->pending_ordered)) { + ordered = list_first_entry(&cur_trans->pending_ordered, + struct btrfs_ordered_extent, + trans_list); + list_del_init(&ordered->trans_list); + spin_unlock(&fs_info->trans_lock); + + btrfs_put_ordered_extent(ordered); + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); +} + void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { @@ -4140,6 +4174,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); + btrfs_free_pending_ordered(cur_trans, root->fs_info); btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 47c1ba141082..222d6aea4a8a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache) struct btrfs_caching_control *ctl; spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_STARTED) { - spin_unlock(&cache->lock); - return NULL; - } - - /* We're loading it the fast way, so we don't have a caching_ctl. */ if (!cache->caching_ctl) { spin_unlock(&cache->lock); return NULL; @@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, spin_unlock(&cache->lock); if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { + mutex_lock(&caching_ctl->mutex); ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); @@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; + caching_ctl->progress = (u64)-1; } else { if (load_cache_only) { cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_NO; } else { cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; } } spin_unlock(&cache->lock); + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); if (ret == 1) { put_caching_control(caching_ctl); @@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_NO; } else { cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; } spin_unlock(&cache->lock); wake_up(&caching_ctl->wait); @@ -3162,7 +3162,19 @@ next_block_group(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct rb_node *node; + spin_lock(&root->fs_info->block_group_cache_lock); + + /* If our block group was removed, we need a full search. */ + if (RB_EMPTY_NODE(&cache->cache_node)) { + const u64 next_bytenr = cache->key.objectid + cache->key.offset; + + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + cache = btrfs_lookup_first_block_group(root->fs_info, + next_bytenr); + return cache; + } node = rb_next(&cache->cache_node); btrfs_put_block_group(cache); if (node) { @@ -3504,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->chunk_alloc = 0; found->flush = 0; init_waitqueue_head(&found->wait); + INIT_LIST_HEAD(&found->ro_bgs); ret = kobject_init_and_add(&found->kobj, &space_info_ktype, info->space_info_kobj, "%s", @@ -5425,7 +5438,17 @@ static int update_block_group(struct btrfs_root *root, spin_unlock(&cache->space_info->lock); } else { old_val -= num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); /* * No longer have used bytes in this block group, queue * it for deletion. @@ -5439,17 +5462,6 @@ static int update_block_group(struct btrfs_root *root, } spin_unlock(&info->unused_bgs_lock); } - btrfs_set_block_group_used(&cache->item, old_val); - cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; - cache->space_info->bytes_used -= num_bytes; - cache->space_info->disk_used -= num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - set_extent_dirty(info->pinned_extents, - bytenr, bytenr + num_bytes - 1, - GFP_NOFS | __GFP_NOFAIL); } btrfs_put_block_group(cache); total -= num_bytes; @@ -8511,6 +8523,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; cache->ro = 1; + list_add_tail(&cache->ro_list, &sinfo->ro_bgs); ret = 0; } out: @@ -8565,15 +8578,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, /* * helper to account the unused space of all the readonly block group in the - * list. takes mirrors into account. + * space_info. takes mirrors into account. */ -static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) { struct btrfs_block_group_cache *block_group; u64 free_bytes = 0; int factor; - list_for_each_entry(block_group, groups_list, list) { + /* It's df, we don't care if it's racey */ + if (list_empty(&sinfo->ro_bgs)) + return 0; + + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { spin_lock(&block_group->lock); if (!block_group->ro) { @@ -8594,26 +8612,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) spin_unlock(&block_group->lock); } - - return free_bytes; -} - -/* - * helper to account the unused space of all the readonly block group in the - * space_info. takes mirrors into account. - */ -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) -{ - int i; - u64 free_bytes = 0; - - spin_lock(&sinfo->lock); - - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - if (!list_empty(&sinfo->block_groups[i])) - free_bytes += __btrfs_get_ro_block_group_free_space( - &sinfo->block_groups[i]); - spin_unlock(&sinfo->lock); return free_bytes; @@ -8633,6 +8631,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root, cache->bytes_super - btrfs_block_group_used(&cache->item); sinfo->bytes_readonly -= num_bytes; cache->ro = 0; + list_del_init(&cache->ro_list); spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); } @@ -9002,7 +9001,9 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); + INIT_LIST_HEAD(&cache->ro_list); btrfs_init_free_space_ctl(cache); + atomic_set(&cache->trimming, 0); return cache; } @@ -9195,9 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, int ret = 0; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { - list_del_init(&block_group->bg_list); if (ret) - continue; + goto next; spin_lock(&block_group->lock); memcpy(&item, &block_group->item, sizeof(item)); @@ -9212,6 +9212,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, key.objectid, key.offset); if (ret) btrfs_abort_transaction(trans, extent_root, ret); +next: + list_del_init(&block_group->bg_list); } } @@ -9304,7 +9306,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start) + struct btrfs_root *root, u64 group_start, + struct extent_map *em) { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; @@ -9316,6 +9319,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int ret; int index; int factor; + struct btrfs_caching_control *caching_ctl = NULL; + bool remove_em; root = root->fs_info->extent_root; @@ -9400,6 +9405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); if (root->fs_info->first_logical_byte == block_group->key.objectid) root->fs_info->first_logical_byte = (u64)-1; @@ -9411,6 +9417,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); + list_del_init(&block_group->ro_list); if (list_empty(&block_group->space_info->block_groups[index])) { kobj = block_group->space_info->block_group_kobjs[index]; block_group->space_info->block_group_kobjs[index] = NULL; @@ -9422,8 +9429,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, kobject_put(kobj); } + if (block_group->has_caching_ctl) + caching_ctl = get_caching_control(block_group); if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); + if (block_group->has_caching_ctl) { + down_write(&root->fs_info->commit_root_sem); + if (!caching_ctl) { + struct btrfs_caching_control *ctl; + + list_for_each_entry(ctl, + &root->fs_info->caching_block_groups, list) + if (ctl->block_group == block_group) { + caching_ctl = ctl; + atomic_inc(&caching_ctl->count); + break; + } + } + if (caching_ctl) + list_del_init(&caching_ctl->list); + up_write(&root->fs_info->commit_root_sem); + if (caching_ctl) { + /* Once for the caching bgs list and once for us. */ + put_caching_control(caching_ctl); + put_caching_control(caching_ctl); + } + } btrfs_remove_free_space_cache(block_group); @@ -9435,6 +9466,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); + lock_chunks(root); + if (!list_empty(&em->list)) { + /* We're in the transaction->pending_chunks list. */ + free_extent_map(em); + } + spin_lock(&block_group->lock); + block_group->removed = 1; + /* + * At this point trimming can't start on this block group, because we + * removed the block group from the tree fs_info->block_group_cache_tree + * so no one can't find it anymore and even if someone already got this + * block group before we removed it from the rbtree, they have already + * incremented block_group->trimming - if they didn't, they won't find + * any free space entries because we already removed them all when we + * called btrfs_remove_free_space_cache(). + * + * And we must not remove the extent map from the fs_info->mapping_tree + * to prevent the same logical address range and physical device space + * ranges from being reused for a new block group. This is because our + * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * completely transactionless, so while it is trimming a range the + * currently running transaction might finish and a new one start, + * allowing for new block groups to be created that can reuse the same + * physical device locations unless we take this special care. + */ + remove_em = (atomic_read(&block_group->trimming) == 0); + /* + * Make sure a trimmer task always sees the em in the pinned_chunks list + * if it sees block_group->removed == 1 (needs to lock block_group->lock + * before checking block_group->removed). + */ + if (!remove_em) { + /* + * Our em might be in trans->transaction->pending_chunks which + * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), + * and so is the fs_info->pinned_chunks list. + * + * So at this point we must be holding the chunk_mutex to avoid + * any races with chunk allocation (more specifically at + * volumes.c:contains_pending_extent()), to ensure it always + * sees the em, either in the pending_chunks list or in the + * pinned_chunks list. + */ + list_move_tail(&em->list, &root->fs_info->pinned_chunks); + } + spin_unlock(&block_group->lock); + + if (remove_em) { + struct extent_map_tree *em_tree; + + em_tree = &root->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + /* + * The em might be in the pending_chunks list, so make sure the + * chunk mutex is locked, since remove_extent_mapping() will + * delete us from that list. + */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + /* once for the tree */ + free_extent_map(em); + } + + unlock_chunks(root); + btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); @@ -9523,10 +9619,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ start = block_group->key.objectid; end = start + block_group->key.offset - 1; - clear_extent_bits(&fs_info->freed_extents[0], start, end, + ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, EXTENT_DIRTY, GFP_NOFS); - clear_extent_bits(&fs_info->freed_extents[1], start, end, + if (ret) { + btrfs_set_block_group_rw(root, block_group); + goto end_trans; + } + ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, EXTENT_DIRTY, GFP_NOFS); + if (ret) { + btrfs_set_block_group_rw(root, block_group); + goto end_trans; + } /* Reset pinned so btrfs_put_block_group doesn't complain */ block_group->pinned = 0; @@ -9537,6 +9641,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ ret = btrfs_remove_chunk(trans, root, block_group->key.objectid); +end_trans: btrfs_end_transaction(trans, root); next: btrfs_put_block_group(block_group); @@ -9657,12 +9762,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) } /* - * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), - * they are used to prevent the some tasks writing data into the page cache - * by nocow before the subvolume is snapshoted, but flush the data into - * the disk after the snapshot creation. + * btrfs_{start,end}_write_no_snapshoting() are similar to + * mnt_{want,drop}_write(), they are used to prevent some tasks from writing + * data into the page cache through nocow before the subvolume is snapshoted, + * but flush the data into disk after the snapshot creation, or to prevent + * operations while snapshoting is ongoing and that cause the snapshot to be + * inconsistent (writes followed by expanding truncates for example). */ -void btrfs_end_nocow_write(struct btrfs_root *root) +void btrfs_end_write_no_snapshoting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); /* @@ -9674,7 +9781,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root) wake_up(&root->subv_writers->wait); } -int btrfs_start_nocow_write(struct btrfs_root *root) +int btrfs_start_write_no_snapshoting(struct btrfs_root *root) { if (atomic_read(&root->will_be_snapshoted)) return 0; @@ -9685,7 +9792,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root) */ smp_mb(); if (atomic_read(&root->will_be_snapshoted)) { - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); return 0; } return 1; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bf3f424e0013..4ebabd237153 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, clear = 1; again: if (!prealloc && (mask & __GFP_WAIT)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ prealloc = alloc_extent_state(mask); - if (!prealloc) - return -ENOMEM; } spin_lock(&tree->lock); @@ -796,17 +801,25 @@ static void set_state_bits(struct extent_io_tree *tree, state->state |= bits_to_set; } -static void cache_state(struct extent_state *state, - struct extent_state **cached_ptr) +static void cache_state_if_flags(struct extent_state *state, + struct extent_state **cached_ptr, + const u64 flags) { if (cached_ptr && !(*cached_ptr)) { - if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { + if (!flags || (state->state & flags)) { *cached_ptr = state; atomic_inc(&state->refs); } } } +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + return cache_state_if_flags(state, cached_ptr, + EXTENT_IOBITS | EXTENT_BOUNDARY); +} + /* * set some bits on a range in the tree. This may require allocations or * sleeping, so the gfp mask is used to indicate what is allowed. @@ -1058,13 +1071,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err = 0; u64 last_start; u64 last_end; + bool first_iteration = true; btrfs_debug_check_extent_io_range(tree, start, end); again: if (!prealloc && (mask & __GFP_WAIT)) { + /* + * Best effort, don't worry if extent state allocation fails + * here for the first iteration. We might have a cached state + * that matches exactly the target range, in which case no + * extent state allocations are needed. We'll only know this + * after locking the tree. + */ prealloc = alloc_extent_state(mask); - if (!prealloc) + if (!prealloc && !first_iteration) return -ENOMEM; } @@ -1234,6 +1255,7 @@ search_again: spin_unlock(&tree->lock); if (mask & __GFP_WAIT) cond_resched(); + first_iteration = false; goto again; } @@ -1482,7 +1504,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, state = find_first_extent_bit_state(tree, start, bits); got_it: if (state) { - cache_state(state, cached_state); + cache_state_if_flags(state, cached_state, 0); *start_ret = state->start; *end_ret = state->end; ret = 0; @@ -1746,6 +1768,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, if (page_ops == 0) return 0; + if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + mapping_set_error(inode->i_mapping, -EIO); + while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, min_t(unsigned long, @@ -1763,6 +1788,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, clear_page_dirty_for_io(pages[i]); if (page_ops & PAGE_SET_WRITEBACK) set_page_writeback(pages[i]); + if (page_ops & PAGE_SET_ERROR) + SetPageError(pages[i]); if (page_ops & PAGE_END_WRITEBACK) end_page_writeback(pages[i]); if (page_ops & PAGE_UNLOCK) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6d4b938be986..ece9ce87edff 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -49,6 +49,7 @@ #define PAGE_SET_WRITEBACK (1 << 2) #define PAGE_END_WRITEBACK (1 << 3) #define PAGE_SET_PRIVATE2 (1 << 4) +#define PAGE_SET_ERROR (1 << 5) /* * page->private values. Every page that is controlled by the extent diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 225302b39afb..6a98bddd8f33 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -287,8 +287,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, if (!em) goto out; - if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) - list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); em->mod_start = em->start; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a18ceabd99a8..e4090259569b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, u64 num_bytes; int ret; - ret = btrfs_start_nocow_write(root); + ret = btrfs_start_write_no_snapshoting(root); if (!ret) return -ENOSPC; @@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); if (ret <= 0) { ret = 0; - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); @@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, btrfs_free_reserved_data_space(inode, reserve_bytes); else - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); break; } @@ -1632,7 +1632,7 @@ again: release_bytes = 0; if (only_release_metadata) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); @@ -1661,7 +1661,7 @@ again: if (release_bytes) { if (only_release_metadata) { - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { btrfs_delalloc_release_space(inode, release_bytes); @@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t pos) { struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); ssize_t written; ssize_t written_buffered; loff_t endbyte; @@ -1692,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, err = written_buffered; goto out; } + /* + * Ensure all data is persisted. We want the next direct IO read to be + * able to read what was just written. + */ endbyte = pos + written_buffered - 1; - err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); + err = btrfs_fdatawrite_range(inode, pos, endbyte); + if (err) + goto out; + err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); if (err) goto out; written += written_buffered; @@ -1854,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) int ret; atomic_inc(&BTRFS_I(inode)->sync_writers); - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + ret = btrfs_fdatawrite_range(inode, start, end); atomic_dec(&BTRFS_I(inode)->sync_writers); return ret; @@ -2810,3 +2815,29 @@ int btrfs_auto_defrag_init(void) return 0; } + +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + /* + * So with compression we will find and lock a dirty page and clear the + * first one as dirty, setup an async extent, and immediately return + * with the entire range locked but with nobody actually marked with + * writeback. So we can't just filemap_write_and_wait_range() and + * expect it to work since it will just kick off a thread to do the + * actual work. So we need to call filemap_fdatawrite_range _again_ + * since it will wait on the page lock, which won't be unlocked until + * after the pages have been marked as writeback and so we're good to go + * from there. We have to do this otherwise we'll miss the ordered + * extents and that results in badness. Please Josef, do not think you + * know better and pull this out at some point in the future, it is + * right and you are wrong. + */ + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + + return ret; +} diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 33848196550e..030847bf7cec 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -27,10 +27,17 @@ #include "disk-io.h" #include "extent_io.h" #include "inode-map.h" +#include "volumes.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) +struct btrfs_trim_range { + u64 start; + u64 bytes; + struct list_head list; +}; + static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, @@ -881,6 +888,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, int ret; struct btrfs_free_cluster *cluster = NULL; struct rb_node *node = rb_first(&ctl->free_space_offset); + struct btrfs_trim_range *trim_entry; /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) { @@ -916,6 +924,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, cluster = NULL; } } + + /* + * Make sure we don't miss any range that was removed from our rbtree + * because trimming is running. Otherwise after a umount+mount (or crash + * after committing the transaction) we would leak free space and get + * an inconsistent free space cache report from fsck. + */ + list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { + ret = io_ctl_add_entry(io_ctl, trim_entry->start, + trim_entry->bytes, NULL); + if (ret) + goto fail; + *entries += 1; + } + return 0; fail: return -ENOSPC; @@ -1135,12 +1158,15 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_set_generation(&io_ctl, trans->transid); + mutex_lock(&ctl->cache_writeout_mutex); /* Write out the extent entries in the free space cache */ ret = write_cache_extent_entries(&io_ctl, ctl, block_group, &entries, &bitmaps, &bitmap_list); - if (ret) + if (ret) { + mutex_unlock(&ctl->cache_writeout_mutex); goto out_nospc; + } /* * Some spaces that are freed in the current transaction are pinned, @@ -1148,11 +1174,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * committed, we shouldn't lose them. */ ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); - if (ret) + if (ret) { + mutex_unlock(&ctl->cache_writeout_mutex); goto out_nospc; + } - /* At last, we write out all the bitmaps. */ + /* + * At last, we write out all the bitmaps and keep cache_writeout_mutex + * locked while doing it because a concurrent trim can be manipulating + * or freeing the bitmap. + */ ret = write_bitmap_entries(&io_ctl, &bitmap_list); + mutex_unlock(&ctl->cache_writeout_mutex); if (ret) goto out_nospc; @@ -2295,6 +2328,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) ctl->start = block_group->key.objectid; ctl->private = block_group; ctl->op = &free_space_op; + INIT_LIST_HEAD(&ctl->trimming_ranges); + mutex_init(&ctl->cache_writeout_mutex); /* * we only want to have 32k of ram per block group for keeping @@ -2911,10 +2946,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) static int do_trimming(struct btrfs_block_group_cache *block_group, u64 *total_trimmed, u64 start, u64 bytes, - u64 reserved_start, u64 reserved_bytes) + u64 reserved_start, u64 reserved_bytes, + struct btrfs_trim_range *trim_entry) { struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; int update = 0; u64 trimmed = 0; @@ -2934,7 +2971,10 @@ static int do_trimming(struct btrfs_block_group_cache *block_group, if (!ret) *total_trimmed += trimmed; + mutex_lock(&ctl->cache_writeout_mutex); btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + list_del(&trim_entry->list); + mutex_unlock(&ctl->cache_writeout_mutex); if (update) { spin_lock(&space_info->lock); @@ -2962,16 +3002,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, u64 bytes; while (start < end) { + struct btrfs_trim_range trim_entry; + + mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); break; } entry = tree_search_offset(ctl, start, 0, 1); if (!entry) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); break; } @@ -2980,6 +3025,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, node = rb_next(&entry->offset_index); if (!node) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); goto out; } entry = rb_entry(node, struct btrfs_free_space, @@ -2988,6 +3034,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, if (entry->offset >= end) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); break; } @@ -2997,6 +3044,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, bytes = min(extent_start + extent_bytes, end) - start; if (bytes < minlen) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); goto next; } @@ -3004,9 +3052,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, kmem_cache_free(btrfs_free_space_cachep, entry); spin_unlock(&ctl->tree_lock); + trim_entry.start = extent_start; + trim_entry.bytes = extent_bytes; + list_add_tail(&trim_entry.list, &ctl->trimming_ranges); + mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - extent_start, extent_bytes); + extent_start, extent_bytes, &trim_entry); if (ret) break; next: @@ -3035,17 +3087,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, while (offset < end) { bool next_bitmap = false; + struct btrfs_trim_range trim_entry; + mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); break; } entry = tree_search_offset(ctl, offset, 1, 0); if (!entry) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } @@ -3054,6 +3110,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, ret2 = search_bitmap(ctl, entry, &start, &bytes); if (ret2 || start >= end) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } @@ -3061,6 +3118,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, bytes = min(bytes, end - start); if (bytes < minlen) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); goto next; } @@ -3069,9 +3127,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, free_bitmap(ctl, entry); spin_unlock(&ctl->tree_lock); + trim_entry.start = start; + trim_entry.bytes = bytes; + list_add_tail(&trim_entry.list, &ctl->trimming_ranges); + mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - start, bytes); + start, bytes, &trim_entry); if (ret) break; next: @@ -3101,11 +3163,52 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, *trimmed = 0; + spin_lock(&block_group->lock); + if (block_group->removed) { + spin_unlock(&block_group->lock); + return 0; + } + atomic_inc(&block_group->trimming); + spin_unlock(&block_group->lock); + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); if (ret) - return ret; + goto out; ret = trim_bitmaps(block_group, trimmed, start, end, minlen); +out: + spin_lock(&block_group->lock); + if (atomic_dec_and_test(&block_group->trimming) && + block_group->removed) { + struct extent_map_tree *em_tree; + struct extent_map *em; + + spin_unlock(&block_group->lock); + + em_tree = &block_group->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, block_group->key.objectid, + 1); + BUG_ON(!em); /* logic error, can't happen */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + lock_chunks(block_group->fs_info->chunk_root); + list_del_init(&em->list); + unlock_chunks(block_group->fs_info->chunk_root); + + /* once for us and once for the tree */ + free_extent_map(em); + free_extent_map(em); + + /* + * We've left one free space entry and other tasks trimming + * this block group have left 1 entry each one. Free them. + */ + __btrfs_remove_free_space_cache(block_group->free_space_ctl); + } else { + spin_unlock(&block_group->lock); + } return ret; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 0cf4977ef70d..88b2238a0aed 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -38,6 +38,8 @@ struct btrfs_free_space_ctl { u64 start; struct btrfs_free_space_op *op; void *private; + struct mutex cache_writeout_mutex; + struct list_head trimming_ranges; }; struct btrfs_free_space_op { diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 83d646bd2e4b..74faea3a516e 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root) root->root_key.objectid); if (IS_ERR(tsk)) { btrfs_warn(root->fs_info, "failed to start inode caching task"); - btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE, "disabling inode map caching"); } } @@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root) ctl->start = 0; ctl->private = NULL; ctl->op = &free_ino_op; + INIT_LIST_HEAD(&ctl->trimming_ranges); + mutex_init(&ctl->cache_writeout_mutex); /* * Initially we allow to use 16K of ram to cache chunks of diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ff0dcc016b71..e687bb0dc73a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode) * are written in the same order that the flusher thread sent them * down. */ -static noinline int compress_file_range(struct inode *inode, +static noinline void compress_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, struct async_cow *async_cow, @@ -411,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode, (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); - /* - * skip compression for a small file range(<=blocksize) that - * isn't an inline extent, since it dosen't save disk space at all. - */ - if ((end - start + 1) <= blocksize && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) - goto cleanup_and_bail_uncompressed; - actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; @@ -440,6 +432,14 @@ again: total_compressed = actual_end - start; + /* + * skip compression for a small file range(<=blocksize) that + * isn't an inline extent, since it dosen't save disk space at all. + */ + if (total_compressed <= blocksize && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + goto cleanup_and_bail_uncompressed; + /* we want to make sure that amount of ram required to uncompress * an extent is reasonable, so we limit the total size in ram * of a compressed extent to 128k. This is a crucial number @@ -527,7 +527,10 @@ cont: if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DEFRAG; + unsigned long page_error_op; + clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; + page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; /* * inline extent creation worked or returned error, @@ -538,6 +541,7 @@ cont: clear_flags, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | + page_error_op | PAGE_END_WRITEBACK); goto free_pages_out; } @@ -620,8 +624,7 @@ cleanup_and_bail_uncompressed: *num_added += 1; } -out: - return ret; + return; free_pages_out: for (i = 0; i < nr_pages_ret; i++) { @@ -629,8 +632,22 @@ free_pages_out: page_cache_release(pages[i]); } kfree(pages); +} - goto out; +static void free_async_extent_pages(struct async_extent *async_extent) +{ + int i; + + if (!async_extent->pages) + return; + + for (i = 0; i < async_extent->nr_pages; i++) { + WARN_ON(async_extent->pages[i]->mapping); + page_cache_release(async_extent->pages[i]); + } + kfree(async_extent->pages); + async_extent->nr_pages = 0; + async_extent->pages = NULL; } /* @@ -639,7 +656,7 @@ free_pages_out: * queued. We walk all the async extents created by compress_file_range * and send them down to the disk. */ -static noinline int submit_compressed_extents(struct inode *inode, +static noinline void submit_compressed_extents(struct inode *inode, struct async_cow *async_cow) { struct async_extent *async_extent; @@ -651,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode, struct extent_io_tree *io_tree; int ret = 0; - if (list_empty(&async_cow->extents)) - return 0; - again: while (!list_empty(&async_cow->extents)) { async_extent = list_entry(async_cow->extents.next, @@ -709,15 +723,7 @@ retry: async_extent->compressed_size, 0, alloc_hint, &ins, 1, 1); if (ret) { - int i; - - for (i = 0; i < async_extent->nr_pages; i++) { - WARN_ON(async_extent->pages[i]->mapping); - page_cache_release(async_extent->pages[i]); - } - kfree(async_extent->pages); - async_extent->nr_pages = 0; - async_extent->pages = NULL; + free_async_extent_pages(async_extent); if (ret == -ENOSPC) { unlock_extent(io_tree, async_extent->start, @@ -814,15 +820,26 @@ retry: ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages); + if (ret) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct page *p = async_extent->pages[0]; + const u64 start = async_extent->start; + const u64 end = start + async_extent->ram_size - 1; + + p->mapping = inode->i_mapping; + tree->ops->writepage_end_io_hook(p, start, end, + NULL, 0); + p->mapping = NULL; + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, + PAGE_END_WRITEBACK | + PAGE_SET_ERROR); + free_async_extent_pages(async_extent); + } alloc_hint = ins.objectid + ins.offset; kfree(async_extent); - if (ret) - goto out; cond_resched(); } - ret = 0; -out: - return ret; + return; out_free_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: @@ -832,7 +849,9 @@ out_free: NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | + PAGE_SET_ERROR); + free_async_extent_pages(async_extent); kfree(async_extent); goto again; } @@ -1318,7 +1337,7 @@ next_slot: * we fall into common COW way. */ if (!nolock) { - err = btrfs_start_nocow_write(root); + err = btrfs_start_write_no_snapshoting(root); if (!err) goto out_check; } @@ -1342,7 +1361,7 @@ out_check: if (extent_end <= start) { path->slots[0]++; if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); goto next_slot; } if (!nocow) { @@ -1362,7 +1381,7 @@ out_check: page_started, nr_written, 1); if (ret) { if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); goto error; } cow_start = (u64)-1; @@ -1413,7 +1432,7 @@ out_check: num_bytes); if (ret) { if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); goto error; } } @@ -1424,7 +1443,7 @@ out_check: EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_SET_PRIVATE2); if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; if (cur_offset > end) break; @@ -4580,6 +4599,26 @@ next: return err; } +static int wait_snapshoting_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +static void wait_for_snapshot_creation(struct btrfs_root *root) +{ + while (true) { + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (ret) + break; + wait_on_atomic_t(&root->will_be_snapshoted, + wait_snapshoting_atomic_t, + TASK_UNINTERRUPTIBLE); + } +} + static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4604,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (newsize > oldsize) { truncate_pagecache(inode, newsize); + /* + * Don't do an expanding truncate while snapshoting is ongoing. + * This is to ensure the snapshot captures a fully consistent + * state of this file - if the snapshot captures this expanding + * truncation, it must capture all writes that happened before + * this truncation. + */ + wait_for_snapshot_creation(root); ret = btrfs_cont_expand(inode, oldsize, newsize); - if (ret) + if (ret) { + btrfs_end_write_no_snapshoting(root); return ret; + } trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + btrfs_end_write_no_snapshoting(root); return PTR_ERR(trans); + } i_size_write(inode, newsize); btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); ret = btrfs_update_inode(trans, root, inode); + btrfs_end_write_no_snapshoting(root); btrfs_end_transaction(trans, root); } else { @@ -7000,9 +7052,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, btrfs_put_ordered_extent(ordered); } else { /* Screw you mmap */ - ret = filemap_write_and_wait_range(inode->i_mapping, - lockstart, - lockend); + ret = btrfs_fdatawrite_range(inode, lockstart, lockend); + if (ret) + break; + ret = filemap_fdatawait_range(inode->i_mapping, + lockstart, + lockend); if (ret) break; @@ -9442,6 +9497,21 @@ out_inode: } +/* Inspired by filemap_check_errors() */ +int btrfs_inode_check_errors(struct inode *inode) +{ + int ret = 0; + + if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags)) + ret = -ENOSPC; + if (test_bit(AS_EIO, &inode->i_mapping->flags) && + test_and_clear_bit(AS_EIO, &inode->i_mapping->flags)) + ret = -EIO; + + return ret; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 080fe66c0349..d49fe8a0f6b5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -617,7 +617,7 @@ fail: return ret; } -static void btrfs_wait_nocow_write(struct btrfs_root *root) +static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) { s64 writers; DEFINE_WAIT(wait); @@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, atomic_inc(&root->will_be_snapshoted); smp_mb__after_atomic(); - btrfs_wait_nocow_write(root); + btrfs_wait_for_no_snapshoting_writes(root); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) @@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - /* - * If orphan cleanup did remove any orphans, it means the tree was - * modified and therefore the commit root is not the same as the - * current root anymore. This is a problem, because send uses the - * commit root and therefore can see inode items that don't exist - * in the current root anymore, and for example make calls to - * btrfs_iget, which will do tree lookups based on the current root - * and not on the commit root. Those lookups will fail, returning a - * -ESTALE error, and making send fail with that error. So make sure - * a send does not see any orphans we have just removed, and that it - * will see the same inodes regardless of whether a transaction - * commit happened before it started (meaning that the commit root - * will be the same as the current root) or not. - */ - if (readonly && pending_snapshot->snap->node != - pending_snapshot->snap->commit_root) { - trans = btrfs_join_transaction(pending_snapshot->snap); - if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto fail; - } - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, - pending_snapshot->snap); - if (ret) - goto fail; - } - } - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -761,7 +732,8 @@ fail: free: kfree(pending_snapshot); out: - atomic_dec(&root->will_be_snapshoted); + if (atomic_dec_and_test(&root->will_be_snapshoted)) + wake_up_atomic_t(&root->will_be_snapshoted); return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ac734ec4cc20..534544e08f76 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); INIT_LIST_HEAD(&entry->log_list); + INIT_LIST_HEAD(&entry->trans_list); trace_btrfs_ordered_extent_add(inode, entry); @@ -431,19 +432,31 @@ out: /* Needs to either be called under a log transaction or the log_mutex */ void btrfs_get_logged_extents(struct inode *inode, - struct list_head *logged_list) + struct list_head *logged_list, + const loff_t start, + const loff_t end) { struct btrfs_ordered_inode_tree *tree; struct btrfs_ordered_extent *ordered; struct rb_node *n; + struct rb_node *prev; tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); - for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + n = __tree_search(&tree->tree, end, &prev); + if (!n) + n = prev; + for (; n; n = rb_prev(n)) { ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + if (ordered->file_offset > end) + continue; + if (entry_end(ordered) <= start) + break; if (!list_empty(&ordered->log_list)) continue; - list_add_tail(&ordered->log_list, logged_list); + if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + continue; + list_add(&ordered->log_list, logged_list); atomic_inc(&ordered->refs); } spin_unlock_irq(&tree->lock); @@ -472,7 +485,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list, spin_unlock_irq(&log->log_extents_lock[index]); } -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *log, u64 transid) { struct btrfs_ordered_extent *ordered; int index = transid % 2; @@ -497,7 +511,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - btrfs_put_ordered_extent(ordered); + if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + list_add_tail(&ordered->trans_list, &trans->ordered); spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -725,30 +740,10 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) /* start IO across the range first to instantiate any delalloc * extents */ - ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + ret = btrfs_fdatawrite_range(inode, start, orig_end); if (ret) return ret; - /* - * So with compression we will find and lock a dirty page and clear the - * first one as dirty, setup an async extent, and immediately return - * with the entire range locked but with nobody actually marked with - * writeback. So we can't just filemap_write_and_wait_range() and - * expect it to work since it will just kick off a thread to do the - * actual work. So we need to call filemap_fdatawrite_range _again_ - * since it will wait on the page lock, which won't be unlocked until - * after the pages have been marked as writeback and so we're good to go - * from there. We have to do this otherwise we'll miss the ordered - * extents and that results in badness. Please Josef, do not think you - * know better and pull this out at some point in the future, it is - * right and you are wrong. - */ - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) { - ret = filemap_fdatawrite_range(inode->i_mapping, start, - orig_end); - if (ret) - return ret; - } + ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); if (ret) return ret; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d81a274d621e..e96cd4ccd805 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -71,6 +71,8 @@ struct btrfs_ordered_sum { ordered extent */ #define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ +#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent + * in the logging code. */ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -121,6 +123,9 @@ struct btrfs_ordered_extent { /* If we need to wait on this to be done */ struct list_head log_list; + /* If the transaction needs to wait on this ordered extent */ + struct list_head trans_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -193,11 +198,14 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); void btrfs_get_logged_extents(struct inode *inode, - struct list_head *logged_list); + struct list_head *logged_list, + const loff_t start, + const loff_t end); void btrfs_put_logged_extents(struct list_head *logged_list); void btrfs_submit_logged_extents(struct list_head *logged_list, struct btrfs_root *log); -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); void ordered_data_exit(void); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6a41631cb959..8ab2a17bbba8 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -58,9 +58,23 @@ */ #define RBIO_CACHE_READY_BIT 3 +/* + * bbio and raid_map is managed by the caller, so we shouldn't free + * them here. And besides that, all rbios with this flag should not + * be cached, because we need raid_map to check the rbios' stripe + * is the same or not, but it is very likely that the caller has + * free raid_map, so don't cache those rbios. + */ +#define RBIO_HOLD_BBIO_MAP_BIT 4 #define RBIO_CACHE_SIZE 1024 +enum btrfs_rbio_ops { + BTRFS_RBIO_WRITE = 0, + BTRFS_RBIO_READ_REBUILD = 1, + BTRFS_RBIO_PARITY_SCRUB = 2, +}; + struct btrfs_raid_bio { struct btrfs_fs_info *fs_info; struct btrfs_bio *bbio; @@ -117,13 +131,16 @@ struct btrfs_raid_bio { /* number of data stripes (no p/q) */ int nr_data; + int real_stripes; + + int stripe_npages; /* * set if we're doing a parity rebuild * for a read from higher up, which is handled * differently from a parity rebuild as part of * rmw */ - int read_rebuild; + enum btrfs_rbio_ops operation; /* first bad stripe */ int faila; @@ -131,6 +148,7 @@ struct btrfs_raid_bio { /* second bad stripe (for raid6 use) */ int failb; + int scrubp; /* * number of pages needed to represent the full * stripe @@ -144,8 +162,13 @@ struct btrfs_raid_bio { */ int bio_list_bytes; + int generic_bio_cnt; + atomic_t refs; + atomic_t stripes_pending; + + atomic_t error; /* * these are two arrays of pointers. We allocate the * rbio big enough to hold them both and setup their @@ -162,6 +185,11 @@ struct btrfs_raid_bio { * here for faster lookup */ struct page **bio_pages; + + /* + * bitmap to record which horizontal stripe has data + */ + unsigned long *dbitmap; }; static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); @@ -176,6 +204,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); +static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, + int need_check); +static void async_scrub_parity(struct btrfs_raid_bio *rbio); + /* * the stripe hash table is used for locking, and to collect * bios in hopes of making a full stripe @@ -324,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest, { bio_list_merge(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; + dest->generic_bio_cnt += victim->generic_bio_cnt; bio_list_init(&victim->bio_list); } @@ -577,11 +610,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, cur->raid_map[0]) return 0; - /* reads can't merge with writes */ - if (last->read_rebuild != - cur->read_rebuild) { + /* we can't merge with different operations */ + if (last->operation != cur->operation) + return 0; + /* + * We've need read the full stripe from the drive. + * check and repair the parity and write the new results. + * + * We're not allowed to add any new bios to the + * bio list here, anyone else that wants to + * change this stripe needs to do their own rmw. + */ + if (last->operation == BTRFS_RBIO_PARITY_SCRUB || + cur->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - } return 1; } @@ -601,7 +643,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) */ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) { - if (rbio->nr_data + 1 == rbio->bbio->num_stripes) + if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; index += ((rbio->nr_data + 1) * rbio->stripe_len) >> @@ -772,11 +814,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); - if (next->read_rebuild) + if (next->operation == BTRFS_RBIO_READ_REBUILD) async_read_rebuild(next); - else { + else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); async_rmw_stripe(next); + } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { + steal_rbio(rbio, next); + async_scrub_parity(next); } goto done_nolock; @@ -796,6 +841,21 @@ done_nolock: remove_rbio_from_cache(rbio); } +static inline void +__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need) +{ + if (need) { + kfree(raid_map); + kfree(bbio); + } +} + +static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio) +{ + __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map, + !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)); +} + static void __free_raid_bio(struct btrfs_raid_bio *rbio) { int i; @@ -814,8 +874,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) rbio->stripe_pages[i] = NULL; } } - kfree(rbio->raid_map); - kfree(rbio->bbio); + + free_bbio_and_raid_map(rbio); + kfree(rbio); } @@ -833,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; + + if (rbio->generic_bio_cnt) + btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); + free_raid_bio(rbio); while (cur) { @@ -858,13 +923,13 @@ static void raid_write_end_io(struct bio *bio, int err) bio_put(bio); - if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + if (!atomic_dec_and_test(&rbio->stripes_pending)) return; err = 0; /* OK, we have read all the stripes we need to. */ - if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) err = -EIO; rbio_orig_end_io(rbio, err, 0); @@ -925,16 +990,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, { struct btrfs_raid_bio *rbio; int nr_data = 0; - int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); + int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; + int num_pages = rbio_nr_pages(stripe_len, real_stripes); + int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); void *p; - rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, + rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + + DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), GFP_NOFS); - if (!rbio) { - kfree(raid_map); - kfree(bbio); + if (!rbio) return ERR_PTR(-ENOMEM); - } bio_list_init(&rbio->bio_list); INIT_LIST_HEAD(&rbio->plug_list); @@ -946,9 +1011,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, rbio->fs_info = root->fs_info; rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; + rbio->real_stripes = real_stripes; + rbio->stripe_npages = stripe_npages; rbio->faila = -1; rbio->failb = -1; atomic_set(&rbio->refs, 1); + atomic_set(&rbio->error, 0); + atomic_set(&rbio->stripes_pending, 0); /* * the stripe_pages and bio_pages array point to the extra @@ -957,11 +1026,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, p = rbio + 1; rbio->stripe_pages = p; rbio->bio_pages = p + sizeof(struct page *) * num_pages; + rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; - if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) - nr_data = bbio->num_stripes - 2; + if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE) + nr_data = real_stripes - 2; else - nr_data = bbio->num_stripes - 1; + nr_data = real_stripes - 1; rbio->nr_data = nr_data; return rbio; @@ -1073,7 +1143,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) { if (rbio->faila >= 0 || rbio->failb >= 0) { - BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); + BUG_ON(rbio->faila == rbio->real_stripes - 1); __raid56_parity_recover(rbio); } else { finish_rmw(rbio); @@ -1134,7 +1204,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_bio *bbio = rbio->bbio; - void *pointers[bbio->num_stripes]; + void *pointers[rbio->real_stripes]; int stripe_len = rbio->stripe_len; int nr_data = rbio->nr_data; int stripe; @@ -1148,11 +1218,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) bio_list_init(&bio_list); - if (bbio->num_stripes - rbio->nr_data == 1) { - p_stripe = bbio->num_stripes - 1; - } else if (bbio->num_stripes - rbio->nr_data == 2) { - p_stripe = bbio->num_stripes - 2; - q_stripe = bbio->num_stripes - 1; + if (rbio->real_stripes - rbio->nr_data == 1) { + p_stripe = rbio->real_stripes - 1; + } else if (rbio->real_stripes - rbio->nr_data == 2) { + p_stripe = rbio->real_stripes - 2; + q_stripe = rbio->real_stripes - 1; } else { BUG(); } @@ -1169,7 +1239,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock_irq(&rbio->bio_list_lock); - atomic_set(&rbio->bbio->error, 0); + atomic_set(&rbio->error, 0); /* * now that we've set rmw_locked, run through the @@ -1209,7 +1279,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) SetPageUptodate(p); pointers[stripe++] = kmap(p); - raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, + raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, pointers); } else { /* raid5 */ @@ -1218,7 +1288,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } - for (stripe = 0; stripe < bbio->num_stripes; stripe++) + for (stripe = 0; stripe < rbio->real_stripes; stripe++) kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); } @@ -1227,7 +1297,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. */ - for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { @@ -1245,8 +1315,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } } - atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); - BUG_ON(atomic_read(&bbio->stripes_pending) == 0); + if (likely(!bbio->num_tgtdevs)) + goto write_data; + + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + if (!bbio->tgtdev_map[stripe]) + continue; + + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + + ret = rbio_add_io_page(rbio, &bio_list, page, + rbio->bbio->tgtdev_map[stripe], + pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + +write_data: + atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); + BUG_ON(atomic_read(&rbio->stripes_pending) == 0); while (1) { bio = bio_list_pop(&bio_list); @@ -1283,7 +1379,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, stripe = &rbio->bbio->stripes[i]; stripe_start = stripe->physical; if (physical >= stripe_start && - physical < stripe_start + rbio->stripe_len) { + physical < stripe_start + rbio->stripe_len && + bio->bi_bdev == stripe->dev->bdev) { return i; } } @@ -1331,11 +1428,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) if (rbio->faila == -1) { /* first failure on this rbio */ rbio->faila = failed; - atomic_inc(&rbio->bbio->error); + atomic_inc(&rbio->error); } else if (rbio->failb == -1) { /* second failure on this rbio */ rbio->failb = failed; - atomic_inc(&rbio->bbio->error); + atomic_inc(&rbio->error); } else { ret = -EIO; } @@ -1394,11 +1491,11 @@ static void raid_rmw_end_io(struct bio *bio, int err) bio_put(bio); - if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + if (!atomic_dec_and_test(&rbio->stripes_pending)) return; err = 0; - if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) goto cleanup; /* @@ -1439,7 +1536,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio) static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) { int bios_to_read = 0; - struct btrfs_bio *bbio = rbio->bbio; struct bio_list bio_list; int ret; int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); @@ -1455,7 +1551,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - atomic_set(&rbio->bbio->error, 0); + atomic_set(&rbio->error, 0); /* * build a list of bios to read all the missing parts of this * stripe @@ -1503,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * the bbio may be freed once we submit the last bio. Make sure * not to touch it after that */ - atomic_set(&bbio->stripes_pending, bios_to_read); + atomic_set(&rbio->stripes_pending, bios_to_read); while (1) { bio = bio_list_pop(&bio_list); if (!bio) @@ -1686,19 +1782,30 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; + int ret; rbio = alloc_rbio(root, bbio, raid_map, stripe_len); - if (IS_ERR(rbio)) + if (IS_ERR(rbio)) { + __free_bbio_and_raid_map(bbio, raid_map, 1); return PTR_ERR(rbio); + } bio_list_add(&rbio->bio_list, bio); rbio->bio_list_bytes = bio->bi_iter.bi_size; + rbio->operation = BTRFS_RBIO_WRITE; + + btrfs_bio_counter_inc_noblocked(root->fs_info); + rbio->generic_bio_cnt = 1; /* * don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) - return full_stripe_write(rbio); + if (rbio_is_full(rbio)) { + ret = full_stripe_write(rbio); + if (ret) + btrfs_bio_counter_dec(root->fs_info); + return ret; + } cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, sizeof(*plug)); @@ -1709,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); + ret = 0; } else { - return __raid56_parity_write(rbio); + ret = __raid56_parity_write(rbio); + if (ret) + btrfs_bio_counter_dec(root->fs_info); } - return 0; + return ret; } /* @@ -1730,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int err; int i; - pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), + pointers = kzalloc(rbio->real_stripes * sizeof(void *), GFP_NOFS); if (!pointers) { err = -ENOMEM; @@ -1740,7 +1850,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) faila = rbio->faila; failb = rbio->failb; - if (rbio->read_rebuild) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { spin_lock_irq(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock_irq(&rbio->bio_list_lock); @@ -1749,15 +1859,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); for (pagenr = 0; pagenr < nr_pages; pagenr++) { + /* + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. + */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(pagenr, rbio->dbitmap)) + continue; + /* setup our array of pointers with pages * from each stripe */ - for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { /* * if we're rebuilding a read, we have to use * pages from the bio list */ - if (rbio->read_rebuild && + if (rbio->operation == BTRFS_RBIO_READ_REBUILD && (stripe == faila || stripe == failb)) { page = page_in_rbio(rbio, stripe, pagenr, 0); } else { @@ -1767,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } /* all raid6 handling here */ - if (rbio->raid_map[rbio->bbio->num_stripes - 1] == + if (rbio->raid_map[rbio->real_stripes - 1] == RAID6_Q_STRIPE) { /* @@ -1817,10 +1935,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } if (rbio->raid_map[failb] == RAID5_P_STRIPE) { - raid6_datap_recov(rbio->bbio->num_stripes, + raid6_datap_recov(rbio->real_stripes, PAGE_SIZE, faila, pointers); } else { - raid6_2data_recov(rbio->bbio->num_stripes, + raid6_2data_recov(rbio->real_stripes, PAGE_SIZE, faila, failb, pointers); } @@ -1850,7 +1968,7 @@ pstripe: * know they can be trusted. If this was a read reconstruction, * other endio functions will fiddle the uptodate bits */ - if (!rbio->read_rebuild) { + if (rbio->operation == BTRFS_RBIO_WRITE) { for (i = 0; i < nr_pages; i++) { if (faila != -1) { page = rbio_stripe_page(rbio, faila, i); @@ -1862,12 +1980,12 @@ pstripe: } } } - for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { /* * if we're rebuilding a read, we have to use * pages from the bio list */ - if (rbio->read_rebuild && + if (rbio->operation == BTRFS_RBIO_READ_REBUILD && (stripe == faila || stripe == failb)) { page = page_in_rbio(rbio, stripe, pagenr, 0); } else { @@ -1882,9 +2000,9 @@ cleanup: kfree(pointers); cleanup_io: - - if (rbio->read_rebuild) { - if (err == 0) + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + if (err == 0 && + !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -1893,7 +2011,13 @@ cleanup_io: } else if (err == 0) { rbio->faila = -1; rbio->failb = -1; - finish_rmw(rbio); + + if (rbio->operation == BTRFS_RBIO_WRITE) + finish_rmw(rbio); + else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) + finish_parity_scrub(rbio, 0); + else + BUG(); } else { rbio_orig_end_io(rbio, err, 0); } @@ -1917,10 +2041,10 @@ static void raid_recover_end_io(struct bio *bio, int err) set_bio_pages_uptodate(bio); bio_put(bio); - if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) rbio_orig_end_io(rbio, -EIO, 0); else __raid_recover_end_io(rbio); @@ -1937,7 +2061,6 @@ static void raid_recover_end_io(struct bio *bio, int err) static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) { int bios_to_read = 0; - struct btrfs_bio *bbio = rbio->bbio; struct bio_list bio_list; int ret; int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); @@ -1951,16 +2074,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) if (ret) goto cleanup; - atomic_set(&rbio->bbio->error, 0); + atomic_set(&rbio->error, 0); /* * read everything that hasn't failed. Thanks to the * stripe cache, it is possible that some or all of these * pages are going to be uptodate. */ - for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { if (rbio->faila == stripe || rbio->failb == stripe) { - atomic_inc(&rbio->bbio->error); + atomic_inc(&rbio->error); continue; } @@ -1990,7 +2113,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * were up to date, or we might have no bios to read because * the devices were gone. */ - if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { + if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { __raid_recover_end_io(rbio); goto out; } else { @@ -2002,7 +2125,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * the bbio may be freed once we submit the last bio. Make sure * not to touch it after that */ - atomic_set(&bbio->stripes_pending, bios_to_read); + atomic_set(&rbio->stripes_pending, bios_to_read); while (1) { bio = bio_list_pop(&bio_list); if (!bio) @@ -2021,7 +2144,7 @@ out: return 0; cleanup: - if (rbio->read_rebuild) + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) rbio_orig_end_io(rbio, -EIO, 0); return -EIO; } @@ -2034,34 +2157,42 @@ cleanup: */ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, int mirror_num) + u64 stripe_len, int mirror_num, int generic_io) { struct btrfs_raid_bio *rbio; int ret; rbio = alloc_rbio(root, bbio, raid_map, stripe_len); - if (IS_ERR(rbio)) + if (IS_ERR(rbio)) { + __free_bbio_and_raid_map(bbio, raid_map, generic_io); return PTR_ERR(rbio); + } - rbio->read_rebuild = 1; + rbio->operation = BTRFS_RBIO_READ_REBUILD; bio_list_add(&rbio->bio_list, bio); rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { BUG(); - kfree(raid_map); - kfree(bbio); + __free_bbio_and_raid_map(bbio, raid_map, generic_io); kfree(rbio); return -EIO; } + if (generic_io) { + btrfs_bio_counter_inc_noblocked(root->fs_info); + rbio->generic_bio_cnt = 1; + } else { + set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags); + } + /* * reconstruct from the q stripe if they are * asking for mirror 3 */ if (mirror_num == 3) - rbio->failb = bbio->num_stripes - 2; + rbio->failb = rbio->real_stripes - 2; ret = lock_stripe_add(rbio); @@ -2098,3 +2229,483 @@ static void read_rebuild_work(struct btrfs_work *work) rbio = container_of(work, struct btrfs_raid_bio, work); __raid56_parity_recover(rbio); } + +/* + * The following code is used to scrub/replace the parity stripe + * + * Note: We need make sure all the pages that add into the scrub/replace + * raid bio are correct and not be changed during the scrub/replace. That + * is those pages just hold metadata or file data with checksum. + */ + +struct btrfs_raid_bio * +raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors) +{ + struct btrfs_raid_bio *rbio; + int i; + + rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + if (IS_ERR(rbio)) + return NULL; + bio_list_add(&rbio->bio_list, bio); + /* + * This is a special bio which is used to hold the completion handler + * and make the scrub rbio is similar to the other types + */ + ASSERT(!bio->bi_iter.bi_size); + rbio->operation = BTRFS_RBIO_PARITY_SCRUB; + + for (i = 0; i < rbio->real_stripes; i++) { + if (bbio->stripes[i].dev == scrub_dev) { + rbio->scrubp = i; + break; + } + } + + /* Now we just support the sectorsize equals to page size */ + ASSERT(root->sectorsize == PAGE_SIZE); + ASSERT(rbio->stripe_npages == stripe_nsectors); + bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + + return rbio; +} + +void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, + struct page *page, u64 logical) +{ + int stripe_offset; + int index; + + ASSERT(logical >= rbio->raid_map[0]); + ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] + + rbio->stripe_len * rbio->nr_data); + stripe_offset = (int)(logical - rbio->raid_map[0]); + index = stripe_offset >> PAGE_CACHE_SHIFT; + rbio->bio_pages[index] = page; +} + +/* + * We just scrub the parity that we have correct data on the same horizontal, + * so we needn't allocate all pages for all the stripes. + */ +static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) +{ + int i; + int bit; + int index; + struct page *page; + + for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { + for (i = 0; i < rbio->real_stripes; i++) { + index = i * rbio->stripe_npages + bit; + if (rbio->stripe_pages[index]) + continue; + + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[index] = page; + ClearPageUptodate(page); + } + } + return 0; +} + +/* + * end io function used by finish_rmw. When we finally + * get here, we've written a full stripe + */ +static void raid_write_parity_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->stripes_pending)) + return; + + err = 0; + + if (atomic_read(&rbio->error)) + err = -EIO; + + rbio_orig_end_io(rbio, err, 0); +} + +static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, + int need_check) +{ + struct btrfs_bio *bbio = rbio->bbio; + void *pointers[rbio->real_stripes]; + DECLARE_BITMAP(pbitmap, rbio->stripe_npages); + int nr_data = rbio->nr_data; + int stripe; + int pagenr; + int p_stripe = -1; + int q_stripe = -1; + struct page *p_page = NULL; + struct page *q_page = NULL; + struct bio_list bio_list; + struct bio *bio; + int is_replace = 0; + int ret; + + bio_list_init(&bio_list); + + if (rbio->real_stripes - rbio->nr_data == 1) { + p_stripe = rbio->real_stripes - 1; + } else if (rbio->real_stripes - rbio->nr_data == 2) { + p_stripe = rbio->real_stripes - 2; + q_stripe = rbio->real_stripes - 1; + } else { + BUG(); + } + + if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { + is_replace = 1; + bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); + } + + /* + * Because the higher layers(scrubber) are unlikely to + * use this area of the disk again soon, so don't cache + * it. + */ + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + if (!need_check) + goto writeback; + + p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!p_page) + goto cleanup; + SetPageUptodate(p_page); + + if (q_stripe != -1) { + q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!q_page) { + __free_page(p_page); + goto cleanup; + } + SetPageUptodate(q_page); + } + + atomic_set(&rbio->error, 0); + + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { + struct page *p; + void *parity; + /* first collect one page from each data stripe */ + for (stripe = 0; stripe < nr_data; stripe++) { + p = page_in_rbio(rbio, stripe, pagenr, 0); + pointers[stripe] = kmap(p); + } + + /* then add the parity stripe */ + pointers[stripe++] = kmap(p_page); + + if (q_stripe != -1) { + + /* + * raid6, add the qstripe and call the + * library function to fill in our p/q + */ + pointers[stripe++] = kmap(q_page); + + raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, + pointers); + } else { + /* raid5 */ + memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + } + + /* Check scrubbing pairty and repair it */ + p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); + parity = kmap(p); + if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE)) + memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE); + else + /* Parity is right, needn't writeback */ + bitmap_clear(rbio->dbitmap, pagenr, 1); + kunmap(p); + + for (stripe = 0; stripe < rbio->real_stripes; stripe++) + kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + } + + __free_page(p_page); + if (q_page) + __free_page(q_page); + +writeback: + /* + * time to start writing. Make bios for everything from the + * higher layers (the bio_list in our rbio) and our p/q. Ignore + * everything else. + */ + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { + struct page *page; + + page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); + ret = rbio_add_io_page(rbio, &bio_list, + page, rbio->scrubp, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + + if (!is_replace) + goto submit_write; + + for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { + struct page *page; + + page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); + ret = rbio_add_io_page(rbio, &bio_list, page, + bbio->tgtdev_map[rbio->scrubp], + pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + +submit_write: + nr_data = bio_list_size(&bio_list); + if (!nr_data) { + /* Every parity is right */ + rbio_orig_end_io(rbio, 0, 0); + return; + } + + atomic_set(&rbio->stripes_pending, nr_data); + + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_write_parity_end_io; + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(WRITE, bio); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) +{ + if (stripe >= 0 && stripe < rbio->nr_data) + return 1; + return 0; +} + +/* + * While we're doing the parity check and repair, we could have errors + * in reading pages off the disk. This checks for errors and if we're + * not able to read the page it'll trigger parity reconstruction. The + * parity scrub will be finished after we've reconstructed the failed + * stripes + */ +static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) +{ + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + goto cleanup; + + if (rbio->faila >= 0 || rbio->failb >= 0) { + int dfail = 0, failp = -1; + + if (is_data_stripe(rbio, rbio->faila)) + dfail++; + else if (is_parity_stripe(rbio->faila)) + failp = rbio->faila; + + if (is_data_stripe(rbio, rbio->failb)) + dfail++; + else if (is_parity_stripe(rbio->failb)) + failp = rbio->failb; + + /* + * Because we can not use a scrubbing parity to repair + * the data, so the capability of the repair is declined. + * (In the case of RAID5, we can not repair anything) + */ + if (dfail > rbio->bbio->max_errors - 1) + goto cleanup; + + /* + * If all data is good, only parity is correctly, just + * repair the parity. + */ + if (dfail == 0) { + finish_parity_scrub(rbio, 0); + return; + } + + /* + * Here means we got one corrupted data stripe and one + * corrupted parity on RAID6, if the corrupted parity + * is scrubbing parity, luckly, use the other one to repair + * the data, or we can not repair the data stripe. + */ + if (failp != rbio->scrubp) + goto cleanup; + + __raid_recover_end_io(rbio); + } else { + finish_parity_scrub(rbio, 1); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +/* + * end io for the read phase of the rmw cycle. All the bios here are physical + * stripe bios we've read from the disk so we can recalculate the parity of the + * stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid56_parity_scrub_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->stripes_pending)) + return; + + /* + * this will normally call finish_rmw to start our write + * but if there are any failed stripes we'll reconstruct + * from parity first + */ + validate_rbio_for_parity_scrub(rbio); +} + +static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct bio_list bio_list; + int ret; + int pagenr; + int stripe; + struct bio *bio; + + ret = alloc_rbio_essential_pages(rbio); + if (ret) + goto cleanup; + + bio_list_init(&bio_list); + + atomic_set(&rbio->error, 0); + /* + * build a list of bios to read all the missing parts of this + * stripe + */ + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { + struct page *page; + /* + * we want to find all the pages missing from + * the rbio and read them from the disk. If + * page_in_rbio finds a page in the bio list + * we don't need to read it off the stripe. + */ + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (page) + continue; + + page = rbio_stripe_page(rbio, stripe, pagenr); + /* + * the bio cache may have handed us an uptodate + * page. If so, be happy and use it + */ + if (PageUptodate(page)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, page, + stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * this can happen if others have merged with + * us, it means there is nothing left to read. + * But if there are missing devices it may not be + * safe to do the full stripe write yet. + */ + goto finish; + } + + /* + * the bbio may be freed once we submit the last bio. Make sure + * not to touch it after that + */ + atomic_set(&rbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid56_parity_scrub_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } + /* the actual write will happen once the reads are done */ + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); + return; + +finish: + validate_rbio_for_parity_scrub(rbio); +} + +static void scrub_parity_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + raid56_parity_scrub_stripe(rbio); +} + +static void async_scrub_parity(struct btrfs_raid_bio *rbio) +{ + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + scrub_parity_work, NULL, NULL); + + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); +} + +void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) +{ + if (!lock_stripe_add(rbio)) + async_scrub_parity(rbio); +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index ea5d73bfdfbe..31d4a157b5e3 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -39,13 +39,25 @@ static inline int nr_data_stripes(struct map_lookup *map) #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ ((x) == RAID6_Q_STRIPE)) +struct btrfs_raid_bio; +struct btrfs_device; + int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, int mirror_num); + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, int mirror_num, int generic_io); int raid56_parity_write(struct btrfs_root *root, struct bio *bio, struct btrfs_bio *bbio, u64 *raid_map, u64 stripe_len); +struct btrfs_raid_bio * +raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors); +void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, + struct page *page, u64 logical); +void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index efa083113827..f2bb13a23f86 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -63,10 +63,18 @@ struct scrub_ctx; */ #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ +struct scrub_recover { + atomic_t refs; + struct btrfs_bio *bbio; + u64 *raid_map; + u64 map_length; +}; + struct scrub_page { struct scrub_block *sblock; struct page *page; struct btrfs_device *dev; + struct list_head list; u64 flags; /* extent flags */ u64 generation; u64 logical; @@ -79,6 +87,8 @@ struct scrub_page { unsigned int io_error:1; }; u8 csum[BTRFS_CSUM_SIZE]; + + struct scrub_recover *recover; }; struct scrub_bio { @@ -105,14 +115,52 @@ struct scrub_block { atomic_t outstanding_pages; atomic_t ref_count; /* free mem on transition to zero */ struct scrub_ctx *sctx; + struct scrub_parity *sparity; struct { unsigned int header_error:1; unsigned int checksum_error:1; unsigned int no_io_error_seen:1; unsigned int generation_error:1; /* also sets header_error */ + + /* The following is for the data used to check parity */ + /* It is for the data with checksum */ + unsigned int data_corrected:1; }; }; +/* Used for the chunks with parity stripe such RAID5/6 */ +struct scrub_parity { + struct scrub_ctx *sctx; + + struct btrfs_device *scrub_dev; + + u64 logic_start; + + u64 logic_end; + + int nsectors; + + int stripe_len; + + atomic_t ref_count; + + struct list_head spages; + + /* Work of parity check and repair */ + struct btrfs_work work; + + /* Mark the parity blocks which have data */ + unsigned long *dbitmap; + + /* + * Mark the parity blocks which have data, but errors happen when + * read data or check data + */ + unsigned long *ebitmap; + + unsigned long bitmap[0]; +}; + struct scrub_wr_ctx { struct scrub_bio *wr_curr_bio; struct btrfs_device *tgtdev; @@ -196,7 +244,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, u8 *csum, u64 generation, - u16 csum_size); + u16 csum_size, int retry_failed_mirror); static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, @@ -218,6 +266,8 @@ static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); static void scrub_page_get(struct scrub_page *spage); static void scrub_page_put(struct scrub_page *spage); +static void scrub_parity_get(struct scrub_parity *sparity); +static void scrub_parity_put(struct scrub_parity *sparity); static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, @@ -790,6 +840,20 @@ out: scrub_pending_trans_workers_dec(sctx); } +static inline void scrub_get_recover(struct scrub_recover *recover) +{ + atomic_inc(&recover->refs); +} + +static inline void scrub_put_recover(struct scrub_recover *recover) +{ + if (atomic_dec_and_test(&recover->refs)) { + kfree(recover->bbio); + kfree(recover->raid_map); + kfree(recover); + } +} + /* * scrub_handle_errored_block gets called when either verification of the * pages failed or the bio failed to read, e.g. with EIO. In the latter @@ -906,7 +970,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* build and submit the bios for the failed mirror, check checksums */ scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, - csum, generation, sctx->csum_size); + csum, generation, sctx->csum_size, 1); if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) { @@ -920,6 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ spin_lock(&sctx->stat_lock); sctx->stat.unverified_errors++; + sblock_to_check->data_corrected = 1; spin_unlock(&sctx->stat_lock); if (sctx->is_dev_replace) @@ -1019,7 +1084,7 @@ nodatasum_case: /* build and submit the bios, check checksums */ scrub_recheck_block(fs_info, sblock_other, is_metadata, have_csum, csum, generation, - sctx->csum_size); + sctx->csum_size, 0); if (!sblock_other->header_error && !sblock_other->checksum_error && @@ -1169,7 +1234,7 @@ nodatasum_case: */ scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, csum, - generation, sctx->csum_size); + generation, sctx->csum_size, 1); if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) @@ -1180,6 +1245,7 @@ nodatasum_case: corrected_error: spin_lock(&sctx->stat_lock); sctx->stat.corrected_errors++; + sblock_to_check->data_corrected = 1; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "BTRFS: fixed up error at logical %llu on dev %s\n", @@ -1201,11 +1267,18 @@ out: mirror_index++) { struct scrub_block *sblock = sblocks_for_recheck + mirror_index; + struct scrub_recover *recover; int page_index; for (page_index = 0; page_index < sblock->page_count; page_index++) { sblock->pagev[page_index]->sblock = NULL; + recover = sblock->pagev[page_index]->recover; + if (recover) { + scrub_put_recover(recover); + sblock->pagev[page_index]->recover = + NULL; + } scrub_page_put(sblock->pagev[page_index]); } } @@ -1215,14 +1288,63 @@ out: return 0; } +static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map) +{ + if (raid_map) { + if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) + return 3; + else + return 2; + } else { + return (int)bbio->num_stripes; + } +} + +static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, + u64 mapped_length, + int nstripes, int mirror, + int *stripe_index, + u64 *stripe_offset) +{ + int i; + + if (raid_map) { + /* RAID5/6 */ + for (i = 0; i < nstripes; i++) { + if (raid_map[i] == RAID6_Q_STRIPE || + raid_map[i] == RAID5_P_STRIPE) + continue; + + if (logical >= raid_map[i] && + logical < raid_map[i] + mapped_length) + break; + } + + *stripe_index = i; + *stripe_offset = logical - raid_map[i]; + } else { + /* The other RAID type */ + *stripe_index = mirror; + *stripe_offset = 0; + } +} + static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info, struct scrub_block *original_sblock, u64 length, u64 logical, struct scrub_block *sblocks_for_recheck) { + struct scrub_recover *recover; + struct btrfs_bio *bbio; + u64 *raid_map; + u64 sublen; + u64 mapped_length; + u64 stripe_offset; + int stripe_index; int page_index; int mirror_index; + int nmirrors; int ret; /* @@ -1233,23 +1355,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, page_index = 0; while (length > 0) { - u64 sublen = min_t(u64, length, PAGE_SIZE); - u64 mapped_length = sublen; - struct btrfs_bio *bbio = NULL; + sublen = min_t(u64, length, PAGE_SIZE); + mapped_length = sublen; + bbio = NULL; + raid_map = NULL; /* * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ - ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, - &mapped_length, &bbio, 0); + ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, + &mapped_length, &bbio, 0, &raid_map); if (ret || !bbio || mapped_length < sublen) { kfree(bbio); + kfree(raid_map); return -EIO; } + recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); + if (!recover) { + kfree(bbio); + kfree(raid_map); + return -ENOMEM; + } + + atomic_set(&recover->refs, 1); + recover->bbio = bbio; + recover->raid_map = raid_map; + recover->map_length = mapped_length; + BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); - for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; + + nmirrors = scrub_nr_raid_mirrors(bbio, raid_map); + for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { struct scrub_block *sblock; struct scrub_page *page; @@ -1265,26 +1403,38 @@ leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); - kfree(bbio); + scrub_put_recover(recover); return -ENOMEM; } scrub_page_get(page); sblock->pagev[page_index] = page; page->logical = logical; - page->physical = bbio->stripes[mirror_index].physical; + + scrub_stripe_index_and_offset(logical, raid_map, + mapped_length, + bbio->num_stripes, + mirror_index, + &stripe_index, + &stripe_offset); + page->physical = bbio->stripes[stripe_index].physical + + stripe_offset; + page->dev = bbio->stripes[stripe_index].dev; + BUG_ON(page_index >= original_sblock->page_count); page->physical_for_dev_replace = original_sblock->pagev[page_index]-> physical_for_dev_replace; /* for missing devices, dev->bdev is NULL */ - page->dev = bbio->stripes[mirror_index].dev; page->mirror_num = mirror_index + 1; sblock->page_count++; page->page = alloc_page(GFP_NOFS); if (!page->page) goto leave_nomem; + + scrub_get_recover(recover); + page->recover = recover; } - kfree(bbio); + scrub_put_recover(recover); length -= sublen; logical += sublen; page_index++; @@ -1293,6 +1443,51 @@ leave_nomem: return 0; } +struct scrub_bio_ret { + struct completion event; + int error; +}; + +static void scrub_bio_wait_endio(struct bio *bio, int error) +{ + struct scrub_bio_ret *ret = bio->bi_private; + + ret->error = error; + complete(&ret->event); +} + +static inline int scrub_is_page_on_raid56(struct scrub_page *page) +{ + return page->recover && page->recover->raid_map; +} + +static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, + struct bio *bio, + struct scrub_page *page) +{ + struct scrub_bio_ret done; + int ret; + + init_completion(&done.event); + done.error = 0; + bio->bi_iter.bi_sector = page->logical >> 9; + bio->bi_private = &done; + bio->bi_end_io = scrub_bio_wait_endio; + + ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio, + page->recover->raid_map, + page->recover->map_length, + page->mirror_num, 0); + if (ret) + return ret; + + wait_for_completion(&done.event); + if (done.error) + return -EIO; + + return 0; +} + /* * this function will check the on disk data for checksum errors, header * errors and read I/O errors. If any I/O errors happen, the exact pages @@ -1303,7 +1498,7 @@ leave_nomem: static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, u8 *csum, u64 generation, - u16 csum_size) + u16 csum_size, int retry_failed_mirror) { int page_num; @@ -1329,11 +1524,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, continue; } bio->bi_bdev = page->dev->bdev; - bio->bi_iter.bi_sector = page->physical >> 9; bio_add_page(bio, page->page, PAGE_SIZE, 0); - if (btrfsic_submit_bio_wait(READ, bio)) - sblock->no_io_error_seen = 0; + if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { + if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) + sblock->no_io_error_seen = 0; + } else { + bio->bi_iter.bi_sector = page->physical >> 9; + + if (btrfsic_submit_bio_wait(READ, bio)) + sblock->no_io_error_seen = 0; + } bio_put(bio); } @@ -1486,6 +1687,13 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) { int page_num; + /* + * This block is used for the check of the parity on the source device, + * so the data needn't be written into the destination device. + */ + if (sblock->sparity) + return; + for (page_num = 0; page_num < sblock->page_count; page_num++) { int ret; @@ -1867,6 +2075,9 @@ static void scrub_block_put(struct scrub_block *sblock) if (atomic_dec_and_test(&sblock->ref_count)) { int i; + if (sblock->sparity) + scrub_parity_put(sblock->sparity); + for (i = 0; i < sblock->page_count; i++) scrub_page_put(sblock->pagev[i]); kfree(sblock); @@ -2124,9 +2335,51 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) scrub_pending_bio_dec(sctx); } +static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, + unsigned long *bitmap, + u64 start, u64 len) +{ + int offset; + int nsectors; + int sectorsize = sparity->sctx->dev_root->sectorsize; + + if (len >= sparity->stripe_len) { + bitmap_set(bitmap, 0, sparity->nsectors); + return; + } + + start -= sparity->logic_start; + offset = (int)do_div(start, sparity->stripe_len); + offset /= sectorsize; + nsectors = (int)len / sectorsize; + + if (offset + nsectors <= sparity->nsectors) { + bitmap_set(bitmap, offset, nsectors); + return; + } + + bitmap_set(bitmap, offset, sparity->nsectors - offset); + bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset)); +} + +static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, + u64 start, u64 len) +{ + __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); +} + +static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, + u64 start, u64 len) +{ + __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); +} + static void scrub_block_complete(struct scrub_block *sblock) { + int corrupted = 0; + if (!sblock->no_io_error_seen) { + corrupted = 1; scrub_handle_errored_block(sblock); } else { /* @@ -2134,9 +2387,19 @@ static void scrub_block_complete(struct scrub_block *sblock) * dev replace case, otherwise write here in dev replace * case. */ - if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) + corrupted = scrub_checksum(sblock); + if (!corrupted && sblock->sctx->is_dev_replace) scrub_write_block_to_dev_replace(sblock); } + + if (sblock->sparity && corrupted && !sblock->data_corrected) { + u64 start = sblock->pagev[0]->logical; + u64 end = sblock->pagev[sblock->page_count - 1]->logical + + PAGE_SIZE; + + scrub_parity_mark_sectors_error(sblock->sparity, + start, end - start); + } } static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, @@ -2228,6 +2491,132 @@ behind_scrub_pages: return 0; } +static int scrub_pages_for_parity(struct scrub_parity *sparity, + u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, + u64 flags, u64 gen, int mirror_num, u8 *csum) +{ + struct scrub_ctx *sctx = sparity->sctx; + struct scrub_block *sblock; + int index; + + sblock = kzalloc(sizeof(*sblock), GFP_NOFS); + if (!sblock) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + /* one ref inside this function, plus one for each page added to + * a bio later on */ + atomic_set(&sblock->ref_count, 1); + sblock->sctx = sctx; + sblock->no_io_error_seen = 1; + sblock->sparity = sparity; + scrub_parity_get(sparity); + + for (index = 0; len > 0; index++) { + struct scrub_page *spage; + u64 l = min_t(u64, len, PAGE_SIZE); + + spage = kzalloc(sizeof(*spage), GFP_NOFS); + if (!spage) { +leave_nomem: + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + scrub_block_put(sblock); + return -ENOMEM; + } + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + /* For scrub block */ + scrub_page_get(spage); + sblock->pagev[index] = spage; + /* For scrub parity */ + scrub_page_get(spage); + list_add_tail(&spage->list, &sparity->spages); + spage->sblock = sblock; + spage->dev = dev; + spage->flags = flags; + spage->generation = gen; + spage->logical = logical; + spage->physical = physical; + spage->mirror_num = mirror_num; + if (csum) { + spage->have_csum = 1; + memcpy(spage->csum, csum, sctx->csum_size); + } else { + spage->have_csum = 0; + } + sblock->page_count++; + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) + goto leave_nomem; + len -= l; + logical += l; + physical += l; + } + + WARN_ON(sblock->page_count == 0); + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev[index]; + int ret; + + ret = scrub_add_page_to_rd_bio(sctx, spage); + if (ret) { + scrub_block_put(sblock); + return ret; + } + } + + /* last one frees, either here or in bio completion for last page */ + scrub_block_put(sblock); + return 0; +} + +static int scrub_extent_for_parity(struct scrub_parity *sparity, + u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, + u64 flags, u64 gen, int mirror_num) +{ + struct scrub_ctx *sctx = sparity->sctx; + int ret; + u8 csum[BTRFS_CSUM_SIZE]; + u32 blocksize; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + blocksize = sctx->sectorsize; + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + blocksize = sctx->nodesize; + } else { + blocksize = sctx->sectorsize; + WARN_ON(1); + } + + while (len) { + u64 l = min_t(u64, len, blocksize); + int have_csum = 0; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + /* push csums to sbio */ + have_csum = scrub_find_csum(sctx, logical, l, csum); + if (have_csum == 0) + goto skip; + } + ret = scrub_pages_for_parity(sparity, logical, l, physical, dev, + flags, gen, mirror_num, + have_csum ? csum : NULL); +skip: + if (ret) + return ret; + len -= l; + logical += l; + physical += l; + } + return 0; +} + /* * Given a physical address, this will calculate it's * logical offset. if this is a parity stripe, it will return @@ -2236,7 +2625,8 @@ behind_scrub_pages: * return 0 if it is a data stripe, 1 means parity stripe. */ static int get_raid56_logic_offset(u64 physical, int num, - struct map_lookup *map, u64 *offset) + struct map_lookup *map, u64 *offset, + u64 *stripe_start) { int i; int j = 0; @@ -2247,6 +2637,9 @@ static int get_raid56_logic_offset(u64 physical, int num, last_offset = (physical - map->stripes[num].physical) * nr_data_stripes(map); + if (stripe_start) + *stripe_start = last_offset; + *offset = last_offset; for (i = 0; i < nr_data_stripes(map); i++) { *offset = last_offset + i * map->stripe_len; @@ -2269,13 +2662,330 @@ static int get_raid56_logic_offset(u64 physical, int num, return 1; } +static void scrub_free_parity(struct scrub_parity *sparity) +{ + struct scrub_ctx *sctx = sparity->sctx; + struct scrub_page *curr, *next; + int nbits; + + nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); + if (nbits) { + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors += nbits; + sctx->stat.uncorrectable_errors += nbits; + spin_unlock(&sctx->stat_lock); + } + + list_for_each_entry_safe(curr, next, &sparity->spages, list) { + list_del_init(&curr->list); + scrub_page_put(curr); + } + + kfree(sparity); +} + +static void scrub_parity_bio_endio(struct bio *bio, int error) +{ + struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; + struct scrub_ctx *sctx = sparity->sctx; + + if (error) + bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, + sparity->nsectors); + + scrub_free_parity(sparity); + scrub_pending_bio_dec(sctx); + bio_put(bio); +} + +static void scrub_parity_check_and_repair(struct scrub_parity *sparity) +{ + struct scrub_ctx *sctx = sparity->sctx; + struct bio *bio; + struct btrfs_raid_bio *rbio; + struct scrub_page *spage; + struct btrfs_bio *bbio = NULL; + u64 *raid_map = NULL; + u64 length; + int ret; + + if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap, + sparity->nsectors)) + goto out; + + length = sparity->logic_end - sparity->logic_start + 1; + ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, + sparity->logic_start, + &length, &bbio, 0, &raid_map); + if (ret || !bbio || !raid_map) + goto bbio_out; + + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); + if (!bio) + goto bbio_out; + + bio->bi_iter.bi_sector = sparity->logic_start >> 9; + bio->bi_private = sparity; + bio->bi_end_io = scrub_parity_bio_endio; + + rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio, + raid_map, length, + sparity->scrub_dev, + sparity->dbitmap, + sparity->nsectors); + if (!rbio) + goto rbio_out; + + list_for_each_entry(spage, &sparity->spages, list) + raid56_parity_add_scrub_pages(rbio, spage->page, + spage->logical); + + scrub_pending_bio_inc(sctx); + raid56_parity_submit_scrub_rbio(rbio); + return; + +rbio_out: + bio_put(bio); +bbio_out: + kfree(bbio); + kfree(raid_map); + bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, + sparity->nsectors); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); +out: + scrub_free_parity(sparity); +} + +static inline int scrub_calc_parity_bitmap_len(int nsectors) +{ + return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); +} + +static void scrub_parity_get(struct scrub_parity *sparity) +{ + atomic_inc(&sparity->ref_count); +} + +static void scrub_parity_put(struct scrub_parity *sparity) +{ + if (!atomic_dec_and_test(&sparity->ref_count)) + return; + + scrub_parity_check_and_repair(sparity); +} + +static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, + struct map_lookup *map, + struct btrfs_device *sdev, + struct btrfs_path *path, + u64 logic_start, + u64 logic_end) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_extent_item *extent; + u64 flags; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + u64 generation; + u64 extent_logical; + u64 extent_physical; + u64 extent_len; + struct btrfs_device *extent_dev; + struct scrub_parity *sparity; + int nsectors; + int bitmap_len; + int extent_mirror_num; + int stop_loop = 0; + + nsectors = map->stripe_len / root->sectorsize; + bitmap_len = scrub_calc_parity_bitmap_len(nsectors); + sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, + GFP_NOFS); + if (!sparity) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + sparity->stripe_len = map->stripe_len; + sparity->nsectors = nsectors; + sparity->sctx = sctx; + sparity->scrub_dev = sdev; + sparity->logic_start = logic_start; + sparity->logic_end = logic_end; + atomic_set(&sparity->ref_count, 1); + INIT_LIST_HEAD(&sparity->spages); + sparity->dbitmap = sparity->bitmap; + sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; + + ret = 0; + while (logic_start < logic_end) { + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logic_start; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = btrfs_previous_extent_item(root, path, 0); + if (ret < 0) + goto out; + if (ret > 0) { + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, + path, 0, 0); + if (ret < 0) + goto out; + } + } + + stop_loop = 0; + while (1) { + u64 bytes; + + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + stop_loop = 1; + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.type == BTRFS_METADATA_ITEM_KEY) + bytes = root->nodesize; + else + bytes = key.offset; + + if (key.objectid + bytes <= logic_start) + goto next; + + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) + goto next; + + if (key.objectid > logic_end) { + stop_loop = 1; + break; + } + + while (key.objectid >= logic_start + map->stripe_len) + logic_start += map->stripe_len; + + extent = btrfs_item_ptr(l, slot, + struct btrfs_extent_item); + flags = btrfs_extent_flags(l, extent); + generation = btrfs_extent_generation(l, extent); + + if (key.objectid < logic_start && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { + btrfs_err(fs_info, + "scrub: tree block %llu spanning stripes, ignored. logical=%llu", + key.objectid, logic_start); + goto next; + } +again: + extent_logical = key.objectid; + extent_len = bytes; + + if (extent_logical < logic_start) { + extent_len -= logic_start - extent_logical; + extent_logical = logic_start; + } + + if (extent_logical + extent_len > + logic_start + map->stripe_len) + extent_len = logic_start + map->stripe_len - + extent_logical; + + scrub_parity_mark_sectors_data(sparity, extent_logical, + extent_len); + + scrub_remap_extent(fs_info, extent_logical, + extent_len, &extent_physical, + &extent_dev, + &extent_mirror_num); + + ret = btrfs_lookup_csums_range(csum_root, + extent_logical, + extent_logical + extent_len - 1, + &sctx->csum_list, 1); + if (ret) + goto out; + + ret = scrub_extent_for_parity(sparity, extent_logical, + extent_len, + extent_physical, + extent_dev, flags, + generation, + extent_mirror_num); + if (ret) + goto out; + + scrub_free_csums(sctx); + if (extent_logical + extent_len < + key.objectid + bytes) { + logic_start += map->stripe_len; + + if (logic_start >= logic_end) { + stop_loop = 1; + break; + } + + if (logic_start < key.objectid + bytes) { + cond_resched(); + goto again; + } + } +next: + path->slots[0]++; + } + + btrfs_release_path(path); + + if (stop_loop) + break; + + logic_start += map->stripe_len; + } +out: + if (ret < 0) + scrub_parity_mark_sectors_error(sparity, logic_start, + logic_end - logic_start + 1); + scrub_parity_put(sparity); + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + + btrfs_release_path(path); + return ret < 0 ? ret : 0; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, int num, u64 base, u64 length, int is_dev_replace) { - struct btrfs_path *path; + struct btrfs_path *path, *ppath; struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; @@ -2302,6 +3012,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 extent_logical; u64 extent_physical; u64 extent_len; + u64 stripe_logical; + u64 stripe_end; struct btrfs_device *extent_dev; int extent_mirror_num; int stop_loop = 0; @@ -2327,7 +3039,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, mirror_num = num % map->num_stripes + 1; } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { - get_raid56_logic_offset(physical, num, map, &offset); + get_raid56_logic_offset(physical, num, map, &offset, NULL); increment = map->stripe_len * nr_data_stripes(map); mirror_num = 1; } else { @@ -2339,6 +3051,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (!path) return -ENOMEM; + ppath = btrfs_alloc_path(); + if (!ppath) { + btrfs_free_path(ppath); + return -ENOMEM; + } + /* * work on commit root. The related disk blocks are static as * long as COW is applied. This means, it is save to rewrite @@ -2357,7 +3075,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { get_raid56_logic_offset(physical_end, num, - map, &logic_end); + map, &logic_end, NULL); logic_end += base; } else { logic_end = logical + increment * nstripes; @@ -2404,10 +3122,18 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { ret = get_raid56_logic_offset(physical, num, - map, &logical); + map, &logical, &stripe_logical); logical += base; - if (ret) + if (ret) { + stripe_logical += base; + stripe_end = stripe_logical + increment - 1; + ret = scrub_raid56_parity(sctx, map, scrub_dev, + ppath, stripe_logical, + stripe_end); + if (ret) + goto out; goto skip; + } } /* * canceled? @@ -2558,13 +3284,25 @@ again: * loop until we find next data stripe * or we have finished all stripes. */ - do { - physical += map->stripe_len; - ret = get_raid56_logic_offset( - physical, num, - map, &logical); - logical += base; - } while (physical < physical_end && ret); +loop: + physical += map->stripe_len; + ret = get_raid56_logic_offset(physical, + num, map, &logical, + &stripe_logical); + logical += base; + + if (ret && physical < physical_end) { + stripe_logical += base; + stripe_end = stripe_logical + + increment - 1; + ret = scrub_raid56_parity(sctx, + map, scrub_dev, ppath, + stripe_logical, + stripe_end); + if (ret) + goto out; + goto loop; + } } else { physical += map->stripe_len; logical += increment; @@ -2605,6 +3343,7 @@ out: blk_finish_plug(&plug); btrfs_free_path(path); + btrfs_free_path(ppath); return ret < 0 ? ret : 0; } @@ -3310,6 +4049,50 @@ out: scrub_pending_trans_workers_dec(sctx); } +static int check_extent_to_block(struct inode *inode, u64 start, u64 len, + u64 logical) +{ + struct extent_state *cached_state = NULL; + struct btrfs_ordered_extent *ordered; + struct extent_io_tree *io_tree; + struct extent_map *em; + u64 lockstart = start, lockend = start + len - 1; + int ret = 0; + + io_tree = &BTRFS_I(inode)->io_tree; + + lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, lockstart, len); + if (ordered) { + btrfs_put_ordered_extent(ordered); + ret = 1; + goto out_unlock; + } + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock; + } + + /* + * This extent does not actually cover the logical extent anymore, + * move on to the next inode. + */ + if (em->block_start > logical || + em->block_start + em->block_len < logical + len) { + free_extent_map(em); + ret = 1; + goto out_unlock; + } + free_extent_map(em); + +out_unlock: + unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, + GFP_NOFS); + return ret; +} + static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, struct scrub_copy_nocow_ctx *nocow_ctx) { @@ -3318,13 +4101,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, struct inode *inode; struct page *page; struct btrfs_root *local_root; - struct btrfs_ordered_extent *ordered; - struct extent_map *em; - struct extent_state *cached_state = NULL; struct extent_io_tree *io_tree; u64 physical_for_dev_replace; + u64 nocow_ctx_logical; u64 len = nocow_ctx->len; - u64 lockstart = offset, lockend = offset + len - 1; unsigned long index; int srcu_index; int ret = 0; @@ -3356,30 +4136,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; io_tree = &BTRFS_I(inode)->io_tree; + nocow_ctx_logical = nocow_ctx->logical; - lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, lockstart, len); - if (ordered) { - btrfs_put_ordered_extent(ordered); - goto out_unlock; - } - - em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out_unlock; - } - - /* - * This extent does not actually cover the logical extent anymore, - * move on to the next inode. - */ - if (em->block_start > nocow_ctx->logical || - em->block_start + em->block_len < nocow_ctx->logical + len) { - free_extent_map(em); - goto out_unlock; + ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical); + if (ret) { + ret = ret > 0 ? 0 : ret; + goto out; } - free_extent_map(em); while (len >= PAGE_CACHE_SIZE) { index = offset >> PAGE_CACHE_SHIFT; @@ -3396,7 +4159,7 @@ again: goto next_page; } else { ClearPageError(page); - err = extent_read_full_page_nolock(io_tree, page, + err = extent_read_full_page(io_tree, page, btrfs_get_extent, nocow_ctx->mirror_num); if (err) { @@ -3421,6 +4184,14 @@ again: goto next_page; } } + + ret = check_extent_to_block(inode, offset, len, + nocow_ctx_logical); + if (ret) { + ret = ret > 0 ? 0 : ret; + goto next_page; + } + err = write_page_nocow(nocow_ctx->sctx, physical_for_dev_replace, page); if (err) @@ -3434,12 +4205,10 @@ next_page: offset += PAGE_CACHE_SIZE; physical_for_dev_replace += PAGE_CACHE_SIZE; + nocow_ctx_logical += PAGE_CACHE_SIZE; len -= PAGE_CACHE_SIZE; } ret = COPY_COMPLETE; -out_unlock: - unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, - GFP_NOFS); out: mutex_unlock(&inode->i_mutex); iput(inode); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 874828dd0a86..804432dbc351 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5507,6 +5507,51 @@ out: return ret; } +/* + * If orphan cleanup did remove any orphans from a root, it means the tree + * was modified and therefore the commit root is not the same as the current + * root anymore. This is a problem, because send uses the commit root and + * therefore can see inode items that don't exist in the current root anymore, + * and for example make calls to btrfs_iget, which will do tree lookups based + * on the current root and not on the commit root. Those lookups will fail, + * returning a -ESTALE error, and making send fail with that error. So make + * sure a send does not see any orphans we have just removed, and that it will + * see the same inodes regardless of whether a transaction commit happened + * before it started (meaning that the commit root will be the same as the + * current root) or not. + */ +static int ensure_commit_roots_uptodate(struct send_ctx *sctx) +{ + int i; + struct btrfs_trans_handle *trans = NULL; + +again: + if (sctx->parent_root && + sctx->parent_root->node != sctx->parent_root->commit_root) + goto commit_trans; + + for (i = 0; i < sctx->clone_roots_cnt; i++) + if (sctx->clone_roots[i].root->node != + sctx->clone_roots[i].root->commit_root) + goto commit_trans; + + if (trans) + return btrfs_end_transaction(trans, sctx->send_root); + + return 0; + +commit_trans: + /* Use any root, all fs roots will get their commit roots updated. */ + if (!trans) { + trans = btrfs_join_transaction(sctx->send_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + goto again; + } + + return btrfs_commit_transaction(trans, sctx->send_root); +} + static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) { spin_lock(&root->root_item_lock); @@ -5728,6 +5773,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) NULL); sort_clone_roots = 1; + ret = ensure_commit_roots_uptodate(sctx); + if (ret) + goto out; + current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); current->journal_info = NULL; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 54bd91ece35b..60f7cbe815e9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -262,7 +262,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ - if (!trans->blocks_used) { + if (!trans->blocks_used && list_empty(&trans->new_bgs)) { const char *errstr; errstr = btrfs_decode_error(errno); @@ -642,11 +642,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) "disabling disk space caching"); break; case Opt_inode_cache: - btrfs_set_and_info(root, CHANGE_INODE_CACHE, + btrfs_set_pending_and_info(info, INODE_MAP_CACHE, "enabling inode map caching"); break; case Opt_noinode_cache: - btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + btrfs_clear_pending_and_info(info, INODE_MAP_CACHE, "disabling inode map caching"); break; case Opt_clear_cache: @@ -993,9 +993,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait) trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ - if (PTR_ERR(trans) == -ENOENT) - return 0; - return PTR_ERR(trans); + if (PTR_ERR(trans) == -ENOENT) { + /* + * Exit unless we have some pending changes + * that need to go through commit + */ + if (fs_info->pending_changes == 0) + return 0; + trans = btrfs_start_transaction(root, 0); + } else { + return PTR_ERR(trans); + } } return btrfs_commit_transaction(trans, root); } @@ -1644,8 +1652,20 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) int i = 0, nr_devices; int ret; + /* + * We aren't under the device list lock, so this is racey-ish, but good + * enough for our purposes. + */ nr_devices = fs_info->fs_devices->open_devices; - BUG_ON(!nr_devices); + if (!nr_devices) { + smp_mb(); + nr_devices = fs_info->fs_devices->open_devices; + ASSERT(nr_devices); + if (!nr_devices) { + *free_bytes = 0; + return 0; + } + } devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), GFP_NOFS); @@ -1670,11 +1690,17 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) else min_stripe_size = BTRFS_STRIPE_LEN; - list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (fs_info->alloc_start) + mutex_lock(&fs_devices->device_list_mutex); + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (!device->in_fs_metadata || !device->bdev || device->is_tgtdev_for_dev_replace) continue; + if (i >= nr_devices) + break; + avail_space = device->total_bytes - device->bytes_used; /* align with stripe_len */ @@ -1689,24 +1715,32 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) skip_space = 1024 * 1024; /* user can set the offset in fs_info->alloc_start. */ - if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= - device->total_bytes) + if (fs_info->alloc_start && + fs_info->alloc_start + BTRFS_STRIPE_LEN <= + device->total_bytes) { + rcu_read_unlock(); skip_space = max(fs_info->alloc_start, skip_space); - /* - * btrfs can not use the free space in [0, skip_space - 1], - * we must subtract it from the total. In order to implement - * it, we account the used space in this range first. - */ - ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, - &used_space); - if (ret) { - kfree(devices_info); - return ret; - } + /* + * btrfs can not use the free space in + * [0, skip_space - 1], we must subtract it from the + * total. In order to implement it, we account the used + * space in this range first. + */ + ret = btrfs_account_dev_extents_size(device, 0, + skip_space - 1, + &used_space); + if (ret) { + kfree(devices_info); + mutex_unlock(&fs_devices->device_list_mutex); + return ret; + } - /* calc the free space in [0, skip_space - 1] */ - skip_space -= used_space; + rcu_read_lock(); + + /* calc the free space in [0, skip_space - 1] */ + skip_space -= used_space; + } /* * we can use the free space in [0, skip_space - 1], subtract @@ -1725,6 +1759,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) i++; } + rcu_read_unlock(); + if (fs_info->alloc_start) + mutex_unlock(&fs_devices->device_list_mutex); nr_devices = i; @@ -1787,8 +1824,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * holding chunk_muext to avoid allocating new chunks, holding * device_list_mutex to avoid the device being removed */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); - mutex_lock(&fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { @@ -1824,17 +1859,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree -= block_rsv->size >> bits; spin_unlock(&block_rsv->lock); - buf->f_bavail = total_free_data; + buf->f_bavail = div_u64(total_free_data, factor); ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); - if (ret) { - mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + if (ret) return ret; - } buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; - mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bsize = dentry->d_sb->s_blocksize; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index b2e7bb4393f6..92db3f648df4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -111,7 +111,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, { struct btrfs_fs_info *fs_info; struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); - struct btrfs_trans_handle *trans; u64 features, set, clear; unsigned long val; int ret; @@ -153,10 +152,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, btrfs_info(fs_info, "%s %s feature flag", val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); - trans = btrfs_start_transaction(fs_info->fs_root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - spin_lock(&fs_info->super_lock); features = get_features(fs_info, fa->feature_set); if (val) @@ -166,9 +161,11 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, set_features(fs_info, fa->feature_set, features); spin_unlock(&fs_info->super_lock); - ret = btrfs_commit_transaction(trans, fs_info->fs_root); - if (ret) - return ret; + /* + * We don't want to do full transaction commit from inside sysfs + */ + btrfs_set_pending(fs_info, COMMIT); + wake_up_process(fs_info->transaction_kthread); return count; } @@ -372,9 +369,6 @@ static ssize_t btrfs_label_store(struct kobject *kobj, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - struct btrfs_trans_handle *trans; - struct btrfs_root *root = fs_info->fs_root; - int ret; size_t p_len; if (fs_info->sb->s_flags & MS_RDONLY) @@ -389,20 +383,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj, if (p_len >= BTRFS_LABEL_SIZE) return -EINVAL; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - spin_lock(&root->fs_info->super_lock); + spin_lock(&fs_info->super_lock); memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); memcpy(fs_info->super_copy->label, buf, p_len); - spin_unlock(&root->fs_info->super_lock); - ret = btrfs_commit_transaction(trans, root); + spin_unlock(&fs_info->super_lock); - if (!ret) - return len; + /* + * We don't want to do full transaction commit from inside sysfs + */ + btrfs_set_pending(fs_info, COMMIT); + wake_up_process(fs_info->transaction_kthread); - return ret; + return len; } BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dcaae3616728..a605d4e2f2bc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) } } +static void clear_btree_io_tree(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever. + */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + if (need_resched()) { + spin_unlock(&tree->lock); + cond_resched(); + spin_lock(&tree->lock); + } + } + spin_unlock(&tree->lock); +} + static noinline void switch_commit_roots(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) { @@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans, root->commit_root = btrfs_root_node(root); if (is_fstree(root->objectid)) btrfs_unpin_free_ino(root); + clear_btree_io_tree(&root->dirty_log_pages); } up_write(&fs_info->commit_root_sem); } @@ -220,6 +247,7 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); + INIT_LIST_HEAD(&cur_trans->pending_ordered); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -488,6 +516,7 @@ again: h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); + INIT_LIST_HEAD(&h->ordered); smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && @@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); + if (!list_empty(&trans->ordered)) { + spin_lock(&info->trans_lock); + list_splice(&trans->ordered, &cur_trans->pending_ordered); + spin_unlock(&info->trans_lock); + } + trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = @@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root, while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { - convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - mark, &cached_state, GFP_NOFS); - cached_state = NULL; - err = filemap_fdatawrite_range(mapping, start, end); + bool wait_writeback = false; + + err = convert_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + mark, &cached_state, GFP_NOFS); + /* + * convert_extent_bit can return -ENOMEM, which is most of the + * time a temporary error. So when it happens, ignore the error + * and wait for writeback of this range to finish - because we + * failed to set the bit EXTENT_NEED_WAIT for the range, a call + * to btrfs_wait_marked_extents() would not know that writeback + * for this range started and therefore wouldn't wait for it to + * finish - we don't want to commit a superblock that points to + * btree nodes/leafs for which writeback hasn't finished yet + * (and without errors). + * We cleanup any entries left in the io tree when committing + * the transaction (through clear_btree_io_tree()). + */ + if (err == -ENOMEM) { + err = 0; + wait_writeback = true; + } + if (!err) + err = filemap_fdatawrite_range(mapping, start, end); if (err) werr = err; + else if (wait_writeback) + werr = filemap_fdatawait_range(mapping, start, end); + free_extent_state(cached_state); + cached_state = NULL; cond_resched(); start = end + 1; } - if (err) - werr = err; return werr; } @@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, while (!find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { - clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - 0, 0, &cached_state, GFP_NOFS); - err = filemap_fdatawait_range(mapping, start, end); + /* + * Ignore -ENOMEM errors returned by clear_extent_bit(). + * When committing the transaction, we'll remove any entries + * left in the io tree. For a log commit, we don't remove them + * after committing the log because the tree can be accessed + * concurrently - we do it only at transaction commit time when + * it's safe to do it (through clear_btree_io_tree()). + */ + err = clear_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + 0, 0, &cached_state, GFP_NOFS); + if (err == -ENOMEM) + err = 0; + if (!err) + err = filemap_fdatawait_range(mapping, start, end); if (err) werr = err; + free_extent_state(cached_state); + cached_state = NULL; cond_resched(); start = end + 1; } @@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, return 0; } -int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, +static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (!trans || !trans->transaction) { - struct inode *btree_inode; - btree_inode = root->fs_info->btree_inode; - return filemap_write_and_wait(btree_inode->i_mapping); - } - return btrfs_write_and_wait_marked_extents(root, + int ret; + + ret = btrfs_write_and_wait_marked_extents(root, &trans->transaction->dirty_pages, EXTENT_DIRTY); + clear_btree_io_tree(&trans->transaction->dirty_pages); + + return ret; } /* @@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, -1); } +static inline void +btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&fs_info->trans_lock); + while (!list_empty(&cur_trans->pending_ordered)) { + ordered = list_first_entry(&cur_trans->pending_ordered, + struct btrfs_ordered_extent, + trans_list); + list_del_init(&ordered->trans_list); + spin_unlock(&fs_info->trans_lock); + + wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &ordered->flags)); + btrfs_put_ordered_extent(ordered); + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } spin_lock(&root->fs_info->trans_lock); + list_splice(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_wait_delalloc_flush(root->fs_info); + btrfs_wait_pending_ordered(cur_trans, root->fs_info); + btrfs_scrub_pause(root); /* * Ok now we need to make sure to block out any other joins while we @@ -1842,13 +1938,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } /* - * Since the transaction is done, we should set the inode map cache flag - * before any other comming transaction. + * Since the transaction is done, we can apply the pending changes + * before the next transaction. */ - if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) - btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); - else - btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + btrfs_apply_pending_changes(root->fs_info); /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots @@ -2019,3 +2112,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) return (ret < 0) ? 0 : 1; } + +void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) +{ + unsigned long prev; + unsigned long bit; + + prev = cmpxchg(&fs_info->pending_changes, 0, 0); + if (!prev) + return; + + bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE; + if (prev & bit) + btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE); + prev &= ~bit; + + bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE; + if (prev & bit) + btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE); + prev &= ~bit; + + bit = 1 << BTRFS_PENDING_COMMIT; + if (prev & bit) + btrfs_debug(fs_info, "pending commit done"); + prev &= ~bit; + + if (prev) + btrfs_warn(fs_info, + "unknown pending changes left 0x%lx, ignoring", prev); +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d8f40e1a5d2d..00ed29c4b3f9 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -56,6 +56,7 @@ struct btrfs_transaction { wait_queue_head_t commit_wait; struct list_head pending_snapshots; struct list_head pending_chunks; + struct list_head pending_ordered; struct list_head switch_commits; struct btrfs_delayed_ref_root delayed_refs; int aborted; @@ -105,6 +106,7 @@ struct btrfs_trans_handle { */ struct btrfs_root *root; struct seq_list delayed_ref_elem; + struct list_head ordered; struct list_head qgroup_ref_list; struct list_head new_bgs; }; @@ -145,8 +147,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier( struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); -int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root); void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); @@ -170,4 +170,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); +void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); + #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 286213cec861..9a02da16f2be 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); - btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, + mark); + btrfs_wait_logged_extents(trans, log, log_transid); wait_log_commit(trans, log_root_tree, root_log_ctx.log_transid); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); - ret = root_log_ctx.log_ret; + if (!ret) + ret = root_log_ctx.log_ret; goto out; } ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); @@ -2641,11 +2643,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } - btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_wait_marked_extents(log_root_tree, - &log_root_tree->dirty_log_pages, - EXTENT_NEW | EXTENT_DIRTY); - btrfs_wait_logged_extents(log, log_transid); + ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + if (!ret) + ret = btrfs_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_NEW | EXTENT_DIRTY); + if (ret) { + btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_free_logged_extents(log, log_transid); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } + btrfs_wait_logged_extents(trans, log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); @@ -3626,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { + /* + * Clear the AS_EIO/AS_ENOSPC flags from the inode's + * i_mapping flags, so that the next fsync won't get + * an outdated io error too. + */ + btrfs_inode_check_errors(inode); *ordered_io_error = true; break; } @@ -3766,7 +3781,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) btrfs_set_token_file_extent_type(leaf, fi, @@ -3963,7 +3978,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - btrfs_get_logged_extents(inode, &logged_list); + btrfs_get_logged_extents(inode, &logged_list, start, end); /* * a brute force approach to making sure we get the most uptodate @@ -4089,6 +4104,21 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (fast_search) { + /* + * Some ordered extents started by fsync might have completed + * before we collected the ordered extents in logged_list, which + * means they're gone, not in our logged_list nor in the inode's + * ordered tree. We want the application/user space to know an + * error happened while attempting to persist file data so that + * it can take proper action. If such error happened, we leave + * without writing to the log tree and the fsync must report the + * file data write error and not commit the current transaction. + */ + err = btrfs_inode_check_errors(inode); + if (err) { + ctx->io_err = err; + goto out_unlock; + } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, &logged_list, ctx); if (ret) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d47289c715c8..0144790e296e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); -static void lock_chunks(struct btrfs_root *root) -{ - mutex_lock(&root->fs_info->chunk_mutex); -} - -static void unlock_chunks(struct btrfs_root *root) -{ - mutex_unlock(&root->fs_info->chunk_mutex); -} - static struct btrfs_fs_devices *__alloc_fs_devices(void) { struct btrfs_fs_devices *fs_devs; @@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, u64 *start, u64 len) { struct extent_map *em; + struct list_head *search_list = &trans->transaction->pending_chunks; int ret = 0; - list_for_each_entry(em, &trans->transaction->pending_chunks, list) { +again: + list_for_each_entry(em, search_list, list) { struct map_lookup *map; int i; @@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, ret = 1; } } + if (search_list == &trans->transaction->pending_chunks) { + search_list = &trans->root->fs_info->pinned_chunks; + goto again; + } return ret; } @@ -1800,8 +1796,8 @@ error_undo: goto error_brelse; } -void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev) +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) { struct btrfs_fs_devices *fs_devices; @@ -1829,6 +1825,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->bdev) fs_devices->open_devices--; +} + +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) +{ + struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; call_rcu(&srcdev->rcu, free_device); @@ -2647,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } } - ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; } - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* once for the tree */ - free_extent_map(em); out: /* once for us */ free_extent_map(em); @@ -4505,6 +4501,8 @@ error_del_extent: free_extent_map(em); /* One for the tree reference */ free_extent_map(em); + /* One for the pending_chunks list reference */ + free_extent_map(em); error: kfree(devices_info); return ret; @@ -4881,13 +4879,15 @@ static inline int parity_smaller(u64 a, u64 b) static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) { struct btrfs_bio_stripe s; + int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; int i; u64 l; int again = 1; + int m; while (again) { again = 0; - for (i = 0; i < bbio->num_stripes - 1; i++) { + for (i = 0; i < real_stripes - 1; i++) { if (parity_smaller(raid_map[i], raid_map[i+1])) { s = bbio->stripes[i]; l = raid_map[i]; @@ -4895,6 +4895,14 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) raid_map[i] = raid_map[i+1]; bbio->stripes[i+1] = s; raid_map[i+1] = l; + + if (bbio->tgtdev_map) { + m = bbio->tgtdev_map[i]; + bbio->tgtdev_map[i] = + bbio->tgtdev_map[i + 1]; + bbio->tgtdev_map[i + 1] = m; + } + again = 1; } } @@ -4923,6 +4931,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, int ret = 0; int num_stripes; int max_errors = 0; + int tgtdev_indexes = 0; struct btrfs_bio *bbio = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; @@ -5161,15 +5170,14 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, BTRFS_BLOCK_GROUP_RAID6)) { u64 tmp; - if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) - && raid_map_ret) { + if (raid_map_ret && + ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + mirror_num > 1)) { int i, rot; /* push stripe_nr back to the start of the full stripe */ stripe_nr = raid56_full_stripe_start; - do_div(stripe_nr, stripe_len); - - stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + do_div(stripe_nr, stripe_len * nr_data_stripes(map)); /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; @@ -5235,14 +5243,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_alloc_stripes <<= 1; if (rw & REQ_GET_READ_MIRRORS) num_alloc_stripes++; + tgtdev_indexes = num_stripes; } - bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); + + bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes), + GFP_NOFS); if (!bbio) { kfree(raid_map); ret = -ENOMEM; goto out; } atomic_set(&bbio->error, 0); + if (dev_replace_is_ongoing) + bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); if (rw & REQ_DISCARD) { int factor = 0; @@ -5327,6 +5340,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) max_errors = btrfs_chunk_max_errors(map); + tgtdev_indexes = 0; if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && dev_replace->tgtdev != NULL) { int index_where_to_add; @@ -5355,8 +5369,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, new->physical = old->physical; new->length = old->length; new->dev = dev_replace->tgtdev; + bbio->tgtdev_map[i] = index_where_to_add; index_where_to_add++; max_errors++; + tgtdev_indexes++; } } num_stripes = index_where_to_add; @@ -5402,7 +5418,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, tgtdev_stripe->length = bbio->stripes[index_srcdev].length; tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; + tgtdev_indexes++; num_stripes++; } } @@ -5412,6 +5430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; + bbio->num_tgtdevs = tgtdev_indexes; /* * this is the case that REQ_READ && dev_replace_is_ongoing && @@ -5443,6 +5462,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, mirror_num, NULL); } +/* For Scrub/replace */ +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, int mirror_num, + u64 **raid_map_ret) +{ + return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, + mirror_num, raid_map_ret); +} + int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len) @@ -5812,12 +5841,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } else { ret = raid56_parity_recover(root, bio, bbio, raid_map, map_length, - mirror_num); + mirror_num, 1); } - /* - * FIXME, replace dosen't support raid56 yet, please fix - * it in the future. - */ + btrfs_bio_counter_dec(root->fs_info); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 08980fa23039..d6fe73c0f4a2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -292,7 +292,7 @@ struct btrfs_bio_stripe { struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); -#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1 +#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) struct btrfs_bio { atomic_t stripes_pending; @@ -305,6 +305,8 @@ struct btrfs_bio { int max_errors; int num_stripes; int mirror_num; + int num_tgtdevs; + int *tgtdev_map; struct btrfs_bio_stripe stripes[]; }; @@ -387,12 +389,18 @@ struct btrfs_balance_control { int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); -#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ - (sizeof(struct btrfs_bio_stripe) * (n))) +#define btrfs_bio_size(total_stripes, real_stripes) \ + (sizeof(struct btrfs_bio) + \ + (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \ + (sizeof(int) * (real_stripes))) int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, int mirror_num, + u64 **raid_map_ret); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); @@ -448,8 +456,10 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, @@ -513,4 +523,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, struct btrfs_transaction *transaction); + +static inline void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static inline void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + + #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index dcf20131fbe4..47b19465f0dc 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -29,6 +29,7 @@ #include "xattr.h" #include "disk-io.h" #include "props.h" +#include "locking.h" ssize_t __btrfs_getxattr(struct inode *inode, const char *name, @@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags) { - struct btrfs_dir_item *di; + struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; size_t name_len = strlen(name); @@ -103,84 +104,119 @@ static int do_setxattr(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->skip_release_on_error = 1; + + if (!value) { + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), + name, name_len, -1); + if (!di && (flags & XATTR_REPLACE)) + ret = -ENODATA; + else if (di) + ret = btrfs_delete_one_dir_name(trans, root, path, di); + goto out; + } + /* + * For a replace we can't just do the insert blindly. + * Do a lookup first (read-only btrfs_search_slot), and return if xattr + * doesn't exist. If it exists, fall down below to the insert/replace + * path - we can't race with a concurrent xattr delete, because the VFS + * locks the inode's i_mutex before calling setxattr or removexattr. + */ if (flags & XATTR_REPLACE) { - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, - name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } else if (!di) { + ASSERT(mutex_is_locked(&inode->i_mutex)); + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), + name, name_len, 0); + if (!di) { ret = -ENODATA; goto out; } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) - goto out; btrfs_release_path(path); + di = NULL; + } + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), + name, name_len, value, size); + if (ret == -EOVERFLOW) { /* - * remove the attribute + * We have an existing item in a leaf, split_leaf couldn't + * expand it. That item might have or not a dir_item that + * matches our target xattr, so lets check. */ - if (!value) - goto out; - } else { - di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), - name, name_len, 0); - if (IS_ERR(di)) { - ret = PTR_ERR(di); + ret = 0; + btrfs_assert_tree_locked(path->nodes[0]); + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (!di && !(flags & XATTR_REPLACE)) { + ret = -ENOSPC; goto out; } - if (!di && !value) - goto out; - btrfs_release_path(path); + } else if (ret == -EEXIST) { + ret = 0; + di = btrfs_match_dir_item_name(root, path, name, name_len); + ASSERT(di); /* logic error */ + } else if (ret) { + goto out; } -again: - ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), - name, name_len, value, size); - /* - * If we're setting an xattr to a new value but the new value is say - * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting - * back from split_leaf. This is because it thinks we'll be extending - * the existing item size, but we're asking for enough space to add the - * item itself. So if we get EOVERFLOW just set ret to EEXIST and let - * the rest of the function figure it out. - */ - if (ret == -EOVERFLOW) + if (di && (flags & XATTR_CREATE)) { ret = -EEXIST; + goto out; + } - if (ret == -EEXIST) { - if (flags & XATTR_CREATE) - goto out; + if (di) { /* - * We can't use the path we already have since we won't have the - * proper locking for a delete, so release the path and - * re-lookup to delete the thing. + * We're doing a replace, and it must be atomic, that is, at + * any point in time we have either the old or the new xattr + * value in the tree. We don't want readers (getxattr and + * listxattrs) to miss a value, this is specially important + * for ACLs. */ - btrfs_release_path(path); - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), - name, name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } else if (!di) { - /* Shouldn't happen but just in case... */ - btrfs_release_path(path); - goto again; + const int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + const u16 old_data_len = btrfs_dir_data_len(leaf, di); + const u32 item_size = btrfs_item_size_nr(leaf, slot); + const u32 data_size = sizeof(*di) + name_len + size; + struct btrfs_item *item; + unsigned long data_ptr; + char *ptr; + + if (size > old_data_len) { + if (btrfs_leaf_free_space(root, leaf) < + (size - old_data_len)) { + ret = -ENOSPC; + goto out; + } } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) - goto out; + if (old_data_len + name_len + sizeof(*di) == item_size) { + /* No other xattrs packed in the same leaf item. */ + if (size > old_data_len) + btrfs_extend_item(root, path, + size - old_data_len); + else if (size < old_data_len) + btrfs_truncate_item(root, path, data_size, 1); + } else { + /* There are other xattrs packed in the same item. */ + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto out; + btrfs_extend_item(root, path, data_size); + } + item = btrfs_item_nr(slot); + ptr = btrfs_item_ptr(leaf, slot, char); + ptr += btrfs_item_size(leaf, item) - data_size; + di = (struct btrfs_dir_item *)ptr; + btrfs_set_dir_data_len(leaf, di, size); + data_ptr = ((unsigned long)(di + 1)) + name_len; + write_extent_buffer(leaf, value, data_ptr, size); + btrfs_mark_buffer_dirty(leaf); + } else { /* - * We have a value to set, so go back and try to insert it now. + * Insert, and we had space for the xattr, so path->slots[0] is + * where our xattr dir_item is and btrfs_insert_xattr_item() + * filled it. */ - if (value) { - btrfs_release_path(path); - goto again; - } } out: btrfs_free_path(path); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index db3f772e57ae..a75fba67bb1f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -158,17 +158,8 @@ struct ext4_allocation_request { #define EXT4_MAP_MAPPED (1 << BH_Mapped) #define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) #define EXT4_MAP_BOUNDARY (1 << BH_Boundary) -/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of - * ext4_map_blocks wants to know whether or not the underlying cluster has - * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that - * the requested mapping was from previously mapped (or delayed allocated) - * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster - * should never appear on buffer_head's state flags. - */ -#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ - EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_FROM_CLUSTER) + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -565,10 +556,8 @@ enum { #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* Do not take i_data_sem locking in ext4_map_blocks */ #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 - /* Do not put hole in extent cache */ -#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 /* Convert written extents to unwritten */ -#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 /* * The bit position of these flags must not overlap with any of the @@ -889,10 +878,12 @@ struct ext4_inode_info { /* extents status tree */ struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; - struct list_head i_es_lru; + struct list_head i_es_list; unsigned int i_es_all_nr; /* protected by i_es_lock */ - unsigned int i_es_lru_nr; /* protected by i_es_lock */ - unsigned long i_touch_when; /* jiffies of last accessing */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for + extents to shrink. Protected by + i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; @@ -1337,10 +1328,11 @@ struct ext4_sb_info { /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; - struct list_head s_es_lru; + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; struct ext4_es_stats s_es_stats; struct mb_cache *s_mb_cache; - spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ struct ratelimit_state s_err_ratelimit_state; @@ -2196,7 +2188,6 @@ extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); -extern void ext4_kvfree(void *ptr); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, @@ -2647,7 +2638,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, int *retval); extern int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - int *has_inline); + int *has_inline, __u64 start, __u64 len); extern int ext4_try_to_evict_inline_data(handle_t *handle, struct inode *inode, int needed); @@ -2795,16 +2786,6 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); /* - * Note that these flags will never ever appear in a buffer_head's state flag. - * See EXT4_MAP_... to see where this is used. - */ -enum ext4_state_bits { - BH_AllocFromCluster /* allocated blocks were part of already - * allocated cluster. */ - = BH_JBDPrivateStart -}; - -/* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough * to mark the bitmap uptodate. We need to also zero-out the bitmap diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0b16fb4c06d3..e5d3eadf47b1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2306,16 +2306,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { int depth = ext_depth(inode); - unsigned long len = 0; - ext4_lblk_t lblock = 0; + ext4_lblk_t len; + ext4_lblk_t lblock; struct ext4_extent *ex; + struct extent_status es; ex = path[depth].p_ext; if (ex == NULL) { - /* - * there is no extent yet, so gap is [0;-] and we - * don't cache it - */ + /* there is no extent yet, so gap is [0;-] */ + lblock = 0; + len = EXT_MAX_BLOCKS; ext_debug("cache gap(whole file):"); } else if (block < le32_to_cpu(ex->ee_block)) { lblock = block; @@ -2324,9 +2324,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block, le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); - if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) - ext4_es_insert_extent(inode, lblock, len, ~0, - EXTENT_STATUS_HOLE); } else if (block >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; @@ -2340,14 +2337,19 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block); BUG_ON(next == lblock); len = next - lblock; - if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) - ext4_es_insert_extent(inode, lblock, len, ~0, - EXTENT_STATUS_HOLE); } else { BUG(); } - ext_debug(" -> %u:%lu\n", lblock, len); + ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es); + if (es.es_len) { + /* There's delayed extent containing lblock? */ + if (es.es_lblk <= lblock) + return; + len = min(es.es_lblk - lblock, len); + } + ext_debug(" -> %u:%u\n", lblock, len); + ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); } /* @@ -2481,7 +2483,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned short ee_len = ext4_ext_get_actual_len(ex); + unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t pblk; int flags = get_default_free_blocks_flags(inode); @@ -2490,7 +2492,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * at the beginning of the extent. Instead, we make a note * that we tried freeing the cluster, and check to see if we * need to free it on a subsequent call to ext4_remove_blocks, - * or at the end of the ext4_truncate() operation. + * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; @@ -2501,8 +2503,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * partial cluster here. */ pblk = ext4_ext_pblock(ex) + ee_len - 1; - if ((*partial_cluster > 0) && - (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + if (*partial_cluster > 0 && + *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, *partial_cluster), sbi->s_cluster_ratio, flags); @@ -2528,7 +2530,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; - unsigned int unaligned; + long long first_cluster; num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; @@ -2538,7 +2540,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * used by any other extent (partial_cluster is negative). */ if (*partial_cluster < 0 && - -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) + *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; ext_debug("free last %u blocks starting %llu partial %lld\n", @@ -2549,21 +2551,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * beginning of a cluster, and we removed the entire * extent and the cluster is not used by any other extent, * save the partial cluster here, since we might need to - * delete if we determine that the truncate operation has - * removed all of the blocks in the cluster. + * delete if we determine that the truncate or punch hole + * operation has removed all of the blocks in the cluster. + * If that cluster is used by another extent, preserve its + * negative value so it isn't freed later on. * - * On the other hand, if we did not manage to free the whole - * extent, we have to mark the cluster as used (store negative - * cluster number in partial_cluster). + * If the whole extent wasn't freed, we've reached the + * start of the truncated/punched region and have finished + * removing blocks. If there's a partial cluster here it's + * shared with the remainder of the extent and is no longer + * a candidate for removal. */ - unaligned = EXT4_PBLK_COFF(sbi, pblk); - if (unaligned && (ee_len == num) && - (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) - *partial_cluster = EXT4_B2C(sbi, pblk); - else if (unaligned) - *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); - else if (*partial_cluster > 0) + if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { + first_cluster = (long long) EXT4_B2C(sbi, pblk); + if (first_cluster != -*partial_cluster) + *partial_cluster = first_cluster; + } else { *partial_cluster = 0; + } } else ext4_error(sbi->s_sb, "strange request: removal(2) " "%u-%u from %u:%u\n", @@ -2574,15 +2579,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, /* * ext4_ext_rm_leaf() Removes the extents associated with the - * blocks appearing between "start" and "end", and splits the extents - * if "start" and "end" appear in the same extent + * blocks appearing between "start" and "end". Both "start" + * and "end" must appear in the same extent or EIO is returned. * * @handle: The journal handle * @inode: The files inode * @path: The path to the leaf * @partial_cluster: The cluster which we'll have to free if all extents - * has been released from it. It gets negative in case - * that the cluster is still used. + * has been released from it. However, if this value is + * negative, it's a cluster just to the right of the + * punched region and it must not be freed. * @start: The first block to remove * @end: The last block to remove */ @@ -2621,27 +2627,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); - /* - * If we're starting with an extent other than the last one in the - * node, we need to see if it shares a cluster with the extent to - * the right (towards the end of the file). If its leftmost cluster - * is this extent's rightmost cluster and it is not cluster aligned, - * we'll mark it as a partial that is not to be deallocated. - */ - - if (ex != EXT_LAST_EXTENT(eh)) { - ext4_fsblk_t current_pblk, right_pblk; - long long current_cluster, right_cluster; - - current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; - current_cluster = (long long)EXT4_B2C(sbi, current_pblk); - right_pblk = ext4_ext_pblock(ex + 1); - right_cluster = (long long)EXT4_B2C(sbi, right_pblk); - if (current_cluster == right_cluster && - EXT4_PBLK_COFF(sbi, right_pblk)) - *partial_cluster = -right_cluster; - } - trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); while (ex >= EXT_FIRST_EXTENT(eh) && @@ -2666,14 +2651,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (end < ex_ee_block) { /* * We're going to skip this extent and move to another, - * so if this extent is not cluster aligned we have - * to mark the current cluster as used to avoid - * accidentally freeing it later on + * so note that its first cluster is in use to avoid + * freeing it when removing blocks. Eventually, the + * right edge of the truncated/punched region will + * be just to the left. */ - pblk = ext4_ext_pblock(ex); - if (EXT4_PBLK_COFF(sbi, pblk)) + if (sbi->s_cluster_ratio > 1) { + pblk = ext4_ext_pblock(ex); *partial_cluster = - -((long long)EXT4_B2C(sbi, pblk)); + -(long long) EXT4_B2C(sbi, pblk); + } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); @@ -2749,8 +2736,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); - } else if (*partial_cluster > 0) - *partial_cluster = 0; + } err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -2769,20 +2755,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, /* * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the - * current extent. If there's a partial cluster and no extents - * remain in the leaf, it can't be freed here. It can only be - * freed when it's possible to determine if it's not shared with - * any other extent - when the next leaf is processed or when space - * removal is complete. + * current extent. If it is shared with the current extent + * we zero partial_cluster because we've reached the start of the + * truncated/punched region and we're done removing blocks. */ - if (*partial_cluster > 0 && eh->eh_entries && - (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != - *partial_cluster)) { - int flags = get_default_free_blocks_flags(inode); - - ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), - sbi->s_cluster_ratio, flags); + if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { + pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, + get_default_free_blocks_flags(inode)); + } *partial_cluster = 0; } @@ -2819,7 +2803,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { - struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; long long partial_cluster = 0; @@ -2845,9 +2829,10 @@ again: */ if (end < EXT_MAX_BLOCKS - 1) { struct ext4_extent *ex; - ext4_lblk_t ee_block; + ext4_lblk_t ee_block, ex_end, lblk; + ext4_fsblk_t pblk; - /* find extent for this block */ + /* find extent for or closest extent to this block */ path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) { ext4_journal_stop(handle); @@ -2867,6 +2852,7 @@ again: } ee_block = le32_to_cpu(ex->ee_block); + ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; /* * See if the last block is inside the extent, if so split @@ -2874,8 +2860,19 @@ again: * tail of the first part of the split extent in * ext4_ext_rm_leaf(). */ - if (end >= ee_block && - end < ee_block + ext4_ext_get_actual_len(ex) - 1) { + if (end >= ee_block && end < ex_end) { + + /* + * If we're going to split the extent, note that + * the cluster containing the block after 'end' is + * in use to avoid freeing it when removing blocks. + */ + if (sbi->s_cluster_ratio > 1) { + pblk = ext4_ext_pblock(ex) + end - ee_block + 2; + partial_cluster = + -(long long) EXT4_B2C(sbi, pblk); + } + /* * Split the extent in two so that 'end' is the last * block in the first new extent. Also we should not @@ -2886,6 +2883,24 @@ again: end + 1, 1); if (err < 0) goto out; + + } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) { + /* + * If there's an extent to the right its first cluster + * contains the immediate right boundary of the + * truncated/punched region. Set partial_cluster to + * its negative value so it won't be freed if shared + * with the current extent. The end < ee_block case + * is handled in ext4_ext_rm_leaf(). + */ + lblk = ex_end + 1; + err = ext4_ext_search_right(inode, path, &lblk, &pblk, + &ex); + if (err) + goto out; + if (pblk) + partial_cluster = + -(long long) EXT4_B2C(sbi, pblk); } } /* @@ -2996,16 +3011,18 @@ again: trace_ext4_ext_remove_space_done(inode, start, end, depth, partial_cluster, path->p_hdr->eh_entries); - /* If we still have something in the partial cluster and we have removed + /* + * If we still have something in the partial cluster and we have removed * even the first extent, then we should free the blocks in the partial - * cluster as well. */ - if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { - int flags = get_default_free_blocks_flags(inode); - + * cluster as well. (This code will only run when there are no leaves + * to the immediate left of the truncated/punched region.) + */ + if (partial_cluster > 0 && err == 0) { + /* don't zero partial_cluster since it's not used afterwards */ ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(EXT4_SB(sb), partial_cluster), - EXT4_SB(sb)->s_cluster_ratio, flags); - partial_cluster = 0; + EXT4_C2B(sbi, partial_cluster), + sbi->s_cluster_ratio, + get_default_free_blocks_flags(inode)); } /* TODO: flexible tree reduction should be here */ @@ -4267,6 +4284,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; int set_unwritten = 0; + bool map_from_cluster = false; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); @@ -4343,10 +4361,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } } - if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - /* * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero @@ -4356,15 +4370,13 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * put just found gap into cache to speed up * subsequent requests */ - if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } /* * Okay, we need to do block allocation. */ - map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); @@ -4376,7 +4388,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; - map->m_flags |= EXT4_MAP_FROM_CLUSTER; + map_from_cluster = true; goto got_allocated_blocks; } @@ -4397,7 +4409,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; - map->m_flags |= EXT4_MAP_FROM_CLUSTER; + map_from_cluster = true; goto got_allocated_blocks; } @@ -4523,7 +4535,7 @@ got_allocated_blocks: */ reserved_clusters = get_reserved_cluster_alloc(inode, map->m_lblk, allocated); - if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { + if (map_from_cluster) { if (reserved_clusters) { /* * We have clusters reserved for this range. @@ -4620,7 +4632,6 @@ out2: trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); - ext4_es_lru_add(inode); return err ? err : allocated; } @@ -5140,7 +5151,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ext4_has_inline_data(inode)) { int has_inline = 1; - error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); + error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline, + start, len); if (has_inline) return error; @@ -5154,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* fallback to generic here if not in extents fmt */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return generic_block_fiemap(inode, fieinfo, start, len, - ext4_get_block); + return __generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) return -EBADR; @@ -5179,7 +5191,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, error = ext4_fill_fiemap_extents(inode, start_blk, len_blks, fieinfo); } - ext4_es_lru_add(inode); return error; } @@ -5239,8 +5250,6 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, return -EIO; ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); - if (!ex_last) - return -EIO; err = ext4_access_path(handle, inode, path + depth); if (err) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 94e7855ae71b..e04d45733976 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -147,10 +147,9 @@ static struct kmem_cache *ext4_es_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end); -static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, - int nr_to_scan); -static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, - struct ext4_inode_info *locked_ei); +static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); +static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); int __init ext4_init_es(void) { @@ -298,6 +297,36 @@ out: trace_ext4_es_find_delayed_extent_range_exit(inode, es); } +static void ext4_es_list_add(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (!list_empty(&ei->i_es_list)) + return; + + spin_lock(&sbi->s_es_lock); + if (list_empty(&ei->i_es_list)) { + list_add_tail(&ei->i_es_list, &sbi->s_es_list); + sbi->s_es_nr_inode++; + } + spin_unlock(&sbi->s_es_lock); +} + +static void ext4_es_list_del(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lock); + if (!list_empty(&ei->i_es_list)) { + list_del_init(&ei->i_es_list); + sbi->s_es_nr_inode--; + WARN_ON_ONCE(sbi->s_es_nr_inode < 0); + } + spin_unlock(&sbi->s_es_lock); +} + static struct extent_status * ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) @@ -314,9 +343,10 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, * We don't count delayed extent because we never try to reclaim them */ if (!ext4_es_is_delayed(es)) { - EXT4_I(inode)->i_es_lru_nr++; + if (!EXT4_I(inode)->i_es_shk_nr++) + ext4_es_list_add(inode); percpu_counter_inc(&EXT4_SB(inode->i_sb)-> - s_es_stats.es_stats_lru_cnt); + s_es_stats.es_stats_shk_cnt); } EXT4_I(inode)->i_es_all_nr++; @@ -330,12 +360,13 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) EXT4_I(inode)->i_es_all_nr--; percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); - /* Decrease the lru counter when this es is not delayed */ + /* Decrease the shrink counter when this es is not delayed */ if (!ext4_es_is_delayed(es)) { - BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); - EXT4_I(inode)->i_es_lru_nr--; + BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); + if (!--EXT4_I(inode)->i_es_shk_nr) + ext4_es_list_del(inode); percpu_counter_dec(&EXT4_SB(inode->i_sb)-> - s_es_stats.es_stats_lru_cnt); + s_es_stats.es_stats_shk_cnt); } kmem_cache_free(ext4_es_cachep, es); @@ -351,7 +382,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) { - if (ext4_es_status(es1) != ext4_es_status(es2)) + if (ext4_es_type(es1) != ext4_es_type(es2)) return 0; if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { @@ -394,6 +425,8 @@ ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) es1 = rb_entry(node, struct extent_status, rb_node); if (ext4_es_can_be_merged(es1, es)) { es1->es_len += es->es_len; + if (ext4_es_is_referenced(es)) + ext4_es_set_referenced(es1); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); es = es1; @@ -416,6 +449,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) es1 = rb_entry(node, struct extent_status, rb_node); if (ext4_es_can_be_merged(es, es1)) { es->es_len += es1->es_len; + if (ext4_es_is_referenced(es1)) + ext4_es_set_referenced(es); rb_erase(node, &tree->root); ext4_es_free_extent(inode, es1); } @@ -683,8 +718,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, goto error; retry: err = __es_insert_extent(inode, &newes); - if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, - EXT4_I(inode))) + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) goto retry; if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) err = 0; @@ -782,6 +817,8 @@ out: es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; + if (!ext4_es_is_referenced(es)) + ext4_es_set_referenced(es); stats->es_stats_cache_hits++; } else { stats->es_stats_cache_misses++; @@ -841,8 +878,8 @@ retry: es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; if ((err == -ENOMEM) && - __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, - EXT4_I(inode))) + __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) goto retry; goto out; } @@ -914,6 +951,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, end = lblk + len - 1; BUG_ON(end < lblk); + /* + * ext4_clear_inode() depends on us taking i_es_lock unconditionally + * so that we are sure __es_shrink() is done with the inode before it + * is reclaimed. + */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end); write_unlock(&EXT4_I(inode)->i_es_lock); @@ -921,114 +963,75 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } -static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, - struct list_head *b) -{ - struct ext4_inode_info *eia, *eib; - eia = list_entry(a, struct ext4_inode_info, i_es_lru); - eib = list_entry(b, struct ext4_inode_info, i_es_lru); - - if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && - !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) - return 1; - if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && - ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) - return -1; - if (eia->i_touch_when == eib->i_touch_when) - return 0; - if (time_after(eia->i_touch_when, eib->i_touch_when)) - return 1; - else - return -1; -} - -static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, - struct ext4_inode_info *locked_ei) +static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei) { struct ext4_inode_info *ei; struct ext4_es_stats *es_stats; - struct list_head *cur, *tmp; - LIST_HEAD(skipped); ktime_t start_time; u64 scan_time; + int nr_to_walk; int nr_shrunk = 0; - int retried = 0, skip_precached = 1, nr_skipped = 0; + int retried = 0, nr_skipped = 0; es_stats = &sbi->s_es_stats; start_time = ktime_get(); - spin_lock(&sbi->s_es_lru_lock); retry: - list_for_each_safe(cur, tmp, &sbi->s_es_lru) { - int shrunk; - - /* - * If we have already reclaimed all extents from extent - * status tree, just stop the loop immediately. - */ - if (percpu_counter_read_positive( - &es_stats->es_stats_lru_cnt) == 0) - break; - - ei = list_entry(cur, struct ext4_inode_info, i_es_lru); + spin_lock(&sbi->s_es_lock); + nr_to_walk = sbi->s_es_nr_inode; + while (nr_to_walk-- > 0) { + if (list_empty(&sbi->s_es_list)) { + spin_unlock(&sbi->s_es_lock); + goto out; + } + ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, + i_es_list); + /* Move the inode to the tail */ + list_move_tail(&ei->i_es_list, &sbi->s_es_list); /* - * Skip the inode that is newer than the last_sorted - * time. Normally we try hard to avoid shrinking - * precached inodes, but we will as a last resort. + * Normally we try hard to avoid shrinking precached inodes, + * but we will as a last resort. */ - if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || - (skip_precached && ext4_test_inode_state(&ei->vfs_inode, - EXT4_STATE_EXT_PRECACHED))) { + if (!retried && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED)) { nr_skipped++; - list_move_tail(cur, &skipped); continue; } - if (ei->i_es_lru_nr == 0 || ei == locked_ei || - !write_trylock(&ei->i_es_lock)) + if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) { + nr_skipped++; continue; + } + /* + * Now we hold i_es_lock which protects us from inode reclaim + * freeing inode under us + */ + spin_unlock(&sbi->s_es_lock); - shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); - if (ei->i_es_lru_nr == 0) - list_del_init(&ei->i_es_lru); + nr_shrunk += es_reclaim_extents(ei, &nr_to_scan); write_unlock(&ei->i_es_lock); - nr_shrunk += shrunk; - nr_to_scan -= shrunk; - if (nr_to_scan == 0) - break; + if (nr_to_scan <= 0) + goto out; + spin_lock(&sbi->s_es_lock); } - - /* Move the newer inodes into the tail of the LRU list. */ - list_splice_tail(&skipped, &sbi->s_es_lru); - INIT_LIST_HEAD(&skipped); + spin_unlock(&sbi->s_es_lock); /* * If we skipped any inodes, and we weren't able to make any - * forward progress, sort the list and try again. + * forward progress, try again to scan precached inodes. */ if ((nr_shrunk == 0) && nr_skipped && !retried) { retried++; - list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); - es_stats->es_stats_last_sorted = jiffies; - ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, - i_es_lru); - /* - * If there are no non-precached inodes left on the - * list, start releasing precached extents. - */ - if (ext4_test_inode_state(&ei->vfs_inode, - EXT4_STATE_EXT_PRECACHED)) - skip_precached = 0; goto retry; } - spin_unlock(&sbi->s_es_lru_lock); - if (locked_ei && nr_shrunk == 0) - nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); + nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan); +out: scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); if (likely(es_stats->es_stats_scan_time)) es_stats->es_stats_scan_time = (scan_time + @@ -1043,7 +1046,7 @@ retry: else es_stats->es_stats_shrunk = nr_shrunk; - trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, + trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, nr_skipped, retried); return nr_shrunk; } @@ -1055,7 +1058,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, struct ext4_sb_info *sbi; sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); - nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; } @@ -1068,13 +1071,13 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; - ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); if (!nr_to_scan) return ret; - nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); + nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); return nr_shrunk; @@ -1102,28 +1105,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) return 0; /* here we just find an inode that has the max nr. of objects */ - spin_lock(&sbi->s_es_lru_lock); - list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { + spin_lock(&sbi->s_es_lock); + list_for_each_entry(ei, &sbi->s_es_list, i_es_list) { inode_cnt++; if (max && max->i_es_all_nr < ei->i_es_all_nr) max = ei; else if (!max) max = ei; } - spin_unlock(&sbi->s_es_lru_lock); + spin_unlock(&sbi->s_es_lock); seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), - percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); + percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); seq_printf(seq, " %lu/%lu cache hits/misses\n", es_stats->es_stats_cache_hits, es_stats->es_stats_cache_misses); - if (es_stats->es_stats_last_sorted != 0) - seq_printf(seq, " %u ms last sorted interval\n", - jiffies_to_msecs(jiffies - - es_stats->es_stats_last_sorted)); if (inode_cnt) - seq_printf(seq, " %d inodes on lru list\n", inode_cnt); + seq_printf(seq, " %d inodes on list\n", inode_cnt); seq_printf(seq, "average:\n %llu us scan time\n", div_u64(es_stats->es_stats_scan_time, 1000)); @@ -1132,7 +1131,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) seq_printf(seq, "maximum:\n %lu inode (%u objects, %u reclaimable)\n" " %llu us max scan time\n", - max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, div_u64(es_stats->es_stats_max_scan_time, 1000)); return 0; @@ -1181,9 +1180,11 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) { int err; - INIT_LIST_HEAD(&sbi->s_es_lru); - spin_lock_init(&sbi->s_es_lru_lock); - sbi->s_es_stats.es_stats_last_sorted = 0; + /* Make sure we have enough bits for physical block number */ + BUILD_BUG_ON(ES_SHIFT < 48); + INIT_LIST_HEAD(&sbi->s_es_list); + sbi->s_es_nr_inode = 0; + spin_lock_init(&sbi->s_es_lock); sbi->s_es_stats.es_stats_shrunk = 0; sbi->s_es_stats.es_stats_cache_hits = 0; sbi->s_es_stats.es_stats_cache_misses = 0; @@ -1192,7 +1193,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); if (err) return err; - err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL); + err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); if (err) goto err1; @@ -1210,7 +1211,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) return 0; err2: - percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); err1: percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); return err; @@ -1221,71 +1222,83 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) if (sbi->s_proc) remove_proc_entry("es_shrinker_info", sbi->s_proc); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); - percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); unregister_shrinker(&sbi->s_es_shrinker); } -void ext4_es_lru_add(struct inode *inode) +/* + * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at + * most *nr_to_scan extents, update *nr_to_scan accordingly. + * + * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan. + * Increment *nr_shrunk by the number of reclaimed extents. Also update + * ei->i_es_shrink_lblk to where we should continue scanning. + */ +static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, + int *nr_to_scan, int *nr_shrunk) { - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - - ei->i_touch_when = jiffies; - - if (!list_empty(&ei->i_es_lru)) - return; + struct inode *inode = &ei->vfs_inode; + struct ext4_es_tree *tree = &ei->i_es_tree; + struct extent_status *es; + struct rb_node *node; - spin_lock(&sbi->s_es_lru_lock); - if (list_empty(&ei->i_es_lru)) - list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); - spin_unlock(&sbi->s_es_lru_lock); -} + es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); + if (!es) + goto out_wrap; + node = &es->rb_node; + while (*nr_to_scan > 0) { + if (es->es_lblk > end) { + ei->i_es_shrink_lblk = end + 1; + return 0; + } -void ext4_es_lru_del(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + (*nr_to_scan)--; + node = rb_next(&es->rb_node); + /* + * We can't reclaim delayed extent from status tree because + * fiemap, bigallic, and seek_data/hole need to use it. + */ + if (ext4_es_is_delayed(es)) + goto next; + if (ext4_es_is_referenced(es)) { + ext4_es_clear_referenced(es); + goto next; + } - spin_lock(&sbi->s_es_lru_lock); - if (!list_empty(&ei->i_es_lru)) - list_del_init(&ei->i_es_lru); - spin_unlock(&sbi->s_es_lru_lock); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + (*nr_shrunk)++; +next: + if (!node) + goto out_wrap; + es = rb_entry(node, struct extent_status, rb_node); + } + ei->i_es_shrink_lblk = es->es_lblk; + return 1; +out_wrap: + ei->i_es_shrink_lblk = 0; + return 0; } -static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, - int nr_to_scan) +static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) { struct inode *inode = &ei->vfs_inode; - struct ext4_es_tree *tree = &ei->i_es_tree; - struct rb_node *node; - struct extent_status *es; - unsigned long nr_shrunk = 0; + int nr_shrunk = 0; + ext4_lblk_t start = ei->i_es_shrink_lblk; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if (ei->i_es_lru_nr == 0) + if (ei->i_es_shk_nr == 0) return 0; if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && __ratelimit(&_rs)) ext4_warning(inode->i_sb, "forced shrink of precached extents"); - node = rb_first(&tree->root); - while (node != NULL) { - es = rb_entry(node, struct extent_status, rb_node); - node = rb_next(&es->rb_node); - /* - * We can't reclaim delayed extent from status tree because - * fiemap, bigallic, and seek_data/hole need to use it. - */ - if (!ext4_es_is_delayed(es)) { - rb_erase(&es->rb_node, &tree->root); - ext4_es_free_extent(inode, es); - nr_shrunk++; - if (--nr_to_scan == 0) - break; - } - } - tree->cache_es = NULL; + if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) && + start != 0) + es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk); + + ei->i_es_tree.cache_es = NULL; return nr_shrunk; } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index efd5f970b501..691b52613ce4 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -29,25 +29,28 @@ /* * These flags live in the high bits of extent_status.es_pblk */ -#define ES_SHIFT 60 - -#define EXTENT_STATUS_WRITTEN (1 << 3) -#define EXTENT_STATUS_UNWRITTEN (1 << 2) -#define EXTENT_STATUS_DELAYED (1 << 1) -#define EXTENT_STATUS_HOLE (1 << 0) +enum { + ES_WRITTEN_B, + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, + ES_REFERENCED_B, + ES_FLAGS +}; -#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ - EXTENT_STATUS_UNWRITTEN | \ - EXTENT_STATUS_DELAYED | \ - EXTENT_STATUS_HOLE) +#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) +#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) -#define ES_WRITTEN (1ULL << 63) -#define ES_UNWRITTEN (1ULL << 62) -#define ES_DELAYED (1ULL << 61) -#define ES_HOLE (1ULL << 60) +#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) +#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) +#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) +#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) +#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) -#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \ - ES_DELAYED | ES_HOLE) +#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) << ES_SHIFT) struct ext4_sb_info; struct ext4_extent; @@ -65,14 +68,13 @@ struct ext4_es_tree { }; struct ext4_es_stats { - unsigned long es_stats_last_sorted; unsigned long es_stats_shrunk; unsigned long es_stats_cache_hits; unsigned long es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; - struct percpu_counter es_stats_lru_cnt; + struct percpu_counter es_stats_shk_cnt; }; extern int __init ext4_init_es(void); @@ -93,29 +95,49 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode, extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline unsigned int ext4_es_type(struct extent_status *es) +{ + return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; +} + static inline int ext4_es_is_written(struct extent_status *es) { - return (es->es_pblk & ES_WRITTEN) != 0; + return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { - return (es->es_pblk & ES_UNWRITTEN) != 0; + return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { - return (es->es_pblk & ES_DELAYED) != 0; + return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { - return (es->es_pblk & ES_HOLE) != 0; + return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } -static inline unsigned int ext4_es_status(struct extent_status *es) +static inline void ext4_es_set_referenced(struct extent_status *es) { - return es->es_pblk >> ES_SHIFT; + es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; +} + +static inline void ext4_es_clear_referenced(struct extent_status *es) +{ + es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); +} + +static inline int ext4_es_is_referenced(struct extent_status *es) +{ + return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) @@ -135,23 +157,19 @@ static inline void ext4_es_store_pblock(struct extent_status *es, static inline void ext4_es_store_status(struct extent_status *es, unsigned int status) { - es->es_pblk = (((ext4_fsblk_t) - (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | - (es->es_pblk & ~ES_MASK)); + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (es->es_pblk & ~ES_MASK); } static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { - es->es_pblk = (((ext4_fsblk_t) - (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | - (pb & ~ES_MASK)); + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (pb & ~ES_MASK); } extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); -extern void ext4_es_lru_add(struct inode *inode); -extern void ext4_es_lru_del(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8131be8c0af3..513c12cf444c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -273,24 +273,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp) * we determine this extent as a data or a hole according to whether the * page cache has data or not. */ -static int ext4_find_unwritten_pgoff(struct inode *inode, - int whence, - struct ext4_map_blocks *map, - loff_t *offset) +static int ext4_find_unwritten_pgoff(struct inode *inode, int whence, + loff_t endoff, loff_t *offset) { struct pagevec pvec; - unsigned int blkbits; pgoff_t index; pgoff_t end; - loff_t endoff; loff_t startoff; loff_t lastoff; int found = 0; - blkbits = inode->i_sb->s_blocksize_bits; startoff = *offset; lastoff = startoff; - endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; + index = startoff >> PAGE_CACHE_SHIFT; end = endoff >> PAGE_CACHE_SHIFT; @@ -408,147 +403,144 @@ out: static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t dataoff, isize; - int blkbits; - int ret = 0; + struct fiemap_extent_info fie; + struct fiemap_extent ext[2]; + loff_t next; + int i, ret = 0; mutex_lock(&inode->i_mutex); - - isize = i_size_read(inode); - if (offset >= isize) { + if (offset >= inode->i_size) { mutex_unlock(&inode->i_mutex); return -ENXIO; } - - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - dataoff = offset; - - do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - if (last != start) - dataoff = (loff_t)last << blkbits; + fie.fi_flags = 0; + fie.fi_extents_max = 2; + fie.fi_extents_start = (struct fiemap_extent __user *) &ext; + while (1) { + mm_segment_t old_fs = get_fs(); + + fie.fi_extents_mapped = 0; + memset(ext, 0, sizeof(*ext) * fie.fi_extents_max); + + set_fs(get_ds()); + ret = ext4_fiemap(inode, &fie, offset, maxsize - offset); + set_fs(old_fs); + if (ret) break; - } - /* - * If there is a delay extent at this offset, - * it will be as a data. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - if (last != start) - dataoff = (loff_t)last << blkbits; + /* No extents found, EOF */ + if (!fie.fi_extents_mapped) { + ret = -ENXIO; break; } + for (i = 0; i < fie.fi_extents_mapped; i++) { + next = (loff_t)(ext[i].fe_length + ext[i].fe_logical); - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, - &map, &dataoff); - if (unwritten) - break; - } + if (offset < (loff_t)ext[i].fe_logical) + offset = (loff_t)ext[i].fe_logical; + /* + * If extent is not unwritten, then it contains valid + * data, mapped or delayed. + */ + if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) + goto out; - last++; - dataoff = (loff_t)last << blkbits; - } while (last <= end); + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, + next, &offset)) + goto out; + if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) { + ret = -ENXIO; + goto out; + } + offset = next; + } + } + if (offset > inode->i_size) + offset = inode->i_size; +out: mutex_unlock(&inode->i_mutex); + if (ret) + return ret; - if (dataoff > isize) - return -ENXIO; - - return vfs_setpos(file, dataoff, maxsize); + return vfs_setpos(file, offset, maxsize); } /* - * ext4_seek_hole() retrieves the offset for SEEK_HOLE. + * ext4_seek_hole() retrieves the offset for SEEK_HOLE */ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t holeoff, isize; - int blkbits; - int ret = 0; + struct fiemap_extent_info fie; + struct fiemap_extent ext[2]; + loff_t next; + int i, ret = 0; mutex_lock(&inode->i_mutex); - - isize = i_size_read(inode); - if (offset >= isize) { + if (offset >= inode->i_size) { mutex_unlock(&inode->i_mutex); return -ENXIO; } - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - holeoff = offset; + fie.fi_flags = 0; + fie.fi_extents_max = 2; + fie.fi_extents_start = (struct fiemap_extent __user *)&ext; + while (1) { + mm_segment_t old_fs = get_fs(); - do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - last += ret; - holeoff = (loff_t)last << blkbits; - continue; - } + fie.fi_extents_mapped = 0; + memset(ext, 0, sizeof(*ext)); - /* - * If there is a delay extent at this offset, - * we will skip this extent. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - last = es.es_lblk + es.es_len; - holeoff = (loff_t)last << blkbits; - continue; - } + set_fs(get_ds()); + ret = ext4_fiemap(inode, &fie, offset, maxsize - offset); + set_fs(old_fs); + if (ret) + break; - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, - &map, &holeoff); - if (!unwritten) { - last += ret; - holeoff = (loff_t)last << blkbits; + /* No extents found */ + if (!fie.fi_extents_mapped) + break; + + for (i = 0; i < fie.fi_extents_mapped; i++) { + next = (loff_t)(ext[i].fe_logical + ext[i].fe_length); + /* + * If extent is not unwritten, then it contains valid + * data, mapped or delayed. + */ + if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) { + if (offset < (loff_t)ext[i].fe_logical) + goto out; + offset = next; continue; } - } - - /* find a hole */ - break; - } while (last <= end); + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + next, &offset)) + goto out; + offset = next; + if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) + goto out; + } + } + if (offset > inode->i_size) + offset = inode->i_size; +out: mutex_unlock(&inode->i_mutex); + if (ret) + return ret; - if (holeoff > isize) - holeoff = isize; - - return vfs_setpos(file, holeoff, maxsize); + return vfs_setpos(file, offset, maxsize); } /* diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3ea62695abce..4b143febf21f 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -811,8 +811,11 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, ret = __block_write_begin(page, 0, inline_size, ext4_da_get_block_prep); if (ret) { + up_read(&EXT4_I(inode)->xattr_sem); + unlock_page(page); + page_cache_release(page); ext4_truncate_failed_write(inode); - goto out; + return ret; } SetPageDirty(page); @@ -870,6 +873,12 @@ retry_journal: goto out_journal; } + /* + * We cannot recurse into the filesystem as the transaction + * is already started. + */ + flags |= AOP_FLAG_NOFS; + if (ret == -ENOSPC) { ret = ext4_da_convert_inline_data_to_extent(mapping, inode, @@ -882,11 +891,6 @@ retry_journal: goto out; } - /* - * We cannot recurse into the filesystem as the transaction - * is already started. - */ - flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { @@ -1807,11 +1811,12 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - int *has_inline) + int *has_inline, __u64 start, __u64 len) { __u64 physical = 0; - __u64 length; - __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; + __u64 inline_len; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | + FIEMAP_EXTENT_LAST; int error = 0; struct ext4_iloc iloc; @@ -1820,6 +1825,13 @@ int ext4_inline_data_fiemap(struct inode *inode, *has_inline = 0; goto out; } + inline_len = min_t(size_t, ext4_get_inline_size(inode), + i_size_read(inode)); + if (start >= inline_len) + goto out; + if (start + len < inline_len) + inline_len = start + len; + inline_len -= start; error = ext4_get_inode_loc(inode, &iloc); if (error) @@ -1828,11 +1840,10 @@ int ext4_inline_data_fiemap(struct inode *inode, physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; physical += offsetof(struct ext4_inode, i_block); - length = i_size_read(inode); if (physical) - error = fiemap_fill_next_extent(fieinfo, 0, physical, - length, flags); + error = fiemap_fill_next_extent(fieinfo, start, physical, + inline_len, flags); brelse(iloc.bh); out: up_read(&EXT4_I(inode)->xattr_sem); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3356ab5395f4..5653fa42930b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -416,11 +416,6 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) up_read((&EXT4_I(inode)->i_data_sem)); - /* - * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag - * because it shouldn't be marked in es_map->m_flags. - */ - map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); /* * We don't check m_len because extent will be collpased in status @@ -491,7 +486,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { - ext4_es_lru_add(inode); if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -1393,7 +1387,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, &es)) { - ext4_es_lru_add(inode); if (ext4_es_is_hole(&es)) { retval = 0; down_read(&EXT4_I(inode)->i_data_sem); @@ -1434,24 +1427,12 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - if (ext4_has_inline_data(inode)) { - /* - * We will soon create blocks for this page, and let - * us pretend as if the blocks aren't allocated yet. - * In case of clusters, we have to handle the work - * of mapping from cluster so that the reserved space - * is calculated properly. - */ - if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; + if (ext4_has_inline_data(inode)) retval = 0; - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, - EXT4_GET_BLOCKS_NO_PUT_HOLE); + else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(NULL, inode, map, 0); else - retval = ext4_ind_map_blocks(NULL, inode, map, - EXT4_GET_BLOCKS_NO_PUT_HOLE); + retval = ext4_ind_map_blocks(NULL, inode, map, 0); add_delayed: if (retval == 0) { @@ -1465,7 +1446,8 @@ add_delayed: * then we don't need to reserve it again. However we still need * to reserve metadata for every block we're going to write. */ - if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { + if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 || + !ext4_find_delalloc_cluster(inode, map->m_lblk)) { ret = ext4_da_reserve_space(inode, iblock); if (ret) { /* not enough space to reserve */ @@ -1481,11 +1463,6 @@ add_delayed: goto out_unlock; } - /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served - * and it should not appear on the bh->b_state. - */ - map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; - map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); @@ -3643,7 +3620,7 @@ out_stop: * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the + * ext4_evict_inode(), and we allow that function to clean up the * orphan info for us. */ if (inode->i_nlink) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bfda18a15592..f58a0d106726 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -78,8 +78,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); - ext4_es_lru_del(inode1); - ext4_es_lru_del(inode2); isize = i_size_read(inode1); i_size_write(inode1, i_size_read(inode2)); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dbfe15c2533c..8d1e60214ef0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2358,7 +2358,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) if (sbi->s_group_info) { memcpy(new_groupinfo, sbi->s_group_info, sbi->s_group_info_size * sizeof(*sbi->s_group_info)); - ext4_kvfree(sbi->s_group_info); + kvfree(sbi->s_group_info); } sbi->s_group_info = new_groupinfo; sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); @@ -2385,7 +2385,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); - meta_group_info = kmalloc(metalen, GFP_KERNEL); + meta_group_info = kmalloc(metalen, GFP_NOFS); if (meta_group_info == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); @@ -2399,7 +2399,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); - meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); if (meta_group_info[i] == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; @@ -2428,7 +2428,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, { struct buffer_head *bh; meta_group_info[i]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_KERNEL); + kmalloc(sb->s_blocksize, GFP_NOFS); BUG_ON(meta_group_info[i]->bb_bitmap == NULL); bh = ext4_read_block_bitmap(sb, group); BUG_ON(bh == NULL); @@ -2495,7 +2495,7 @@ err_freebuddy: kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: - ext4_kvfree(sbi->s_group_info); + kvfree(sbi->s_group_info); return -ENOMEM; } @@ -2708,12 +2708,11 @@ int ext4_mb_release(struct super_block *sb) EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(sbi->s_group_info[i]); - ext4_kvfree(sbi->s_group_info); + kvfree(sbi->s_group_info); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); - if (sbi->s_buddy_cache) - iput(sbi->s_buddy_cache); + iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { ext4_msg(sb, KERN_INFO, "mballoc: %u blocks %u reqs (%u success)", diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a432634f2e6a..3cb267aee802 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -592,7 +592,7 @@ err_out: /* * set the i_blocks count to zero - * so that the ext4_delete_inode does the + * so that the ext4_evict_inode() does the * right job * * We don't need to take the i_lock because diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 9f2311bc9c4f..503ea15dc5db 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -273,6 +273,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, int replaced_count = 0; int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + struct super_block *sb = orig_inode->i_sb; /* * It needs twice the amount of ordinary journal buffers because @@ -405,10 +406,13 @@ unlock_pages: page_cache_release(pagep[1]); stop_journal: ext4_journal_stop(handle); + if (*err == -ENOSPC && + ext4_should_retry_alloc(sb, &retries)) + goto again; /* Buffer was busy because probably is pinned to journal transaction, * force transaction commit may help to free it. */ - if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, - &retries)) + if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && + jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) goto again; return replaced_count; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 426211882f72..2291923dae4e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2814,7 +2814,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) ext4_orphan_add(handle, inode); inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); - retval = 0; end_unlink: brelse(bh); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index ca4588388fc3..bf76f405a5f9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -856,7 +856,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, n_group_desc[gdb_num] = gdb_bh; EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; - ext4_kvfree(o_group_desc); + kvfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); err = ext4_handle_dirty_super(handle, sb); @@ -866,7 +866,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return err; exit_inode: - ext4_kvfree(n_group_desc); + kvfree(n_group_desc); brelse(iloc.bh); exit_dind: brelse(dind); @@ -909,7 +909,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb, n_group_desc[gdb_num] = gdb_bh; EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; - ext4_kvfree(o_group_desc); + kvfree(o_group_desc); BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 63e802b8ec68..43c92b1685cb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -176,15 +176,6 @@ void *ext4_kvzalloc(size_t size, gfp_t flags) return ret; } -void ext4_kvfree(void *ptr) -{ - if (is_vmalloc_addr(ptr)) - vfree(ptr); - else - kfree(ptr); - -} - ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { @@ -811,8 +802,8 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); - ext4_kvfree(sbi->s_group_desc); - ext4_kvfree(sbi->s_flex_groups); + kvfree(sbi->s_group_desc); + kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -880,10 +871,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); - INIT_LIST_HEAD(&ei->i_es_lru); + INIT_LIST_HEAD(&ei->i_es_list); ei->i_es_all_nr = 0; - ei->i_es_lru_nr = 0; - ei->i_touch_when = 0; + ei->i_es_shk_nr = 0; + ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -973,7 +964,6 @@ void ext4_clear_inode(struct inode *inode) dquot_drop(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); - ext4_es_lru_del(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1153,7 +1143,7 @@ enum { Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, - Opt_max_dir_size_kb, + Opt_max_dir_size_kb, Opt_nojournal_checksum, }; static const match_table_t tokens = { @@ -1187,6 +1177,7 @@ static const match_table_t tokens = { {Opt_journal_dev, "journal_dev=%u"}, {Opt_journal_path, "journal_path=%s"}, {Opt_journal_checksum, "journal_checksum"}, + {Opt_nojournal_checksum, "nojournal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_abort, "abort"}, {Opt_data_journal, "data=journal"}, @@ -1368,6 +1359,8 @@ static const struct mount_opts { MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, + {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, + MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | @@ -1709,6 +1702,12 @@ static int parse_options(char *options, struct super_block *sb, return 0; } } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && + test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit " + "in data=ordered mode"); + return 0; + } return 1; } @@ -1946,7 +1945,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) memcpy(new_groups, sbi->s_flex_groups, (sbi->s_flex_groups_allocated * sizeof(struct flex_groups))); - ext4_kvfree(sbi->s_flex_groups); + kvfree(sbi->s_flex_groups); } sbi->s_flex_groups = new_groups; sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); @@ -3317,7 +3316,7 @@ int ext4_calculate_overhead(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; - char *buf = (char *) get_zeroed_page(GFP_KERNEL); + char *buf = (char *) get_zeroed_page(GFP_NOFS); if (!buf) return -ENOMEM; @@ -3345,8 +3344,8 @@ int ext4_calculate_overhead(struct super_block *sb) memset(buf, 0, PAGE_SIZE); cond_resched(); } - /* Add the journal blocks as well */ - if (sbi->s_journal) + /* Add the internal journal blocks as well */ + if (sbi->s_journal && !sbi->journal_bdev) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); sbi->s_overhead = overhead; @@ -4232,7 +4231,7 @@ failed_mount7: failed_mount6: ext4_mb_release(sb); if (sbi->s_flex_groups) - ext4_kvfree(sbi->s_flex_groups); + kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -4261,7 +4260,7 @@ failed_mount3: failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - ext4_kvfree(sbi->s_group_desc); + kvfree(sbi->s_group_desc); failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); @@ -4862,6 +4861,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ + test_opt(sb, JOURNAL_CHECKSUM)) { + ext4_msg(sb, KERN_ERR, "changing journal_checksum " + "during remount not supported"); + err = -EINVAL; + goto restore_opts; + } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 1df94fabe4eb..b96bd8076b70 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1714,8 +1714,7 @@ int jbd2_journal_destroy(journal_t *journal) if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); - if (journal->j_inode) - iput(journal->j_inode); + iput(journal->j_inode); if (journal->j_revoke) jbd2_journal_destroy_revoke(journal); if (journal->j_chksum_driver) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index ec881b312700..2f389ce5023c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -61,6 +61,11 @@ module_param(mem_size, ulong, 0400); MODULE_PARM_DESC(mem_size, "size of reserved RAM used to store oops/panic logs"); +static unsigned int mem_type; +module_param(mem_type, uint, 0600); +MODULE_PARM_DESC(mem_type, + "set to 1 to try to use unbuffered memory (default 0)"); + static int dump_oops = 1; module_param(dump_oops, int, 0600); MODULE_PARM_DESC(dump_oops, @@ -79,6 +84,7 @@ struct ramoops_context { struct persistent_ram_zone *fprz; phys_addr_t phys_addr; unsigned long size; + unsigned int memtype; size_t record_size; size_t console_size; size_t ftrace_size; @@ -366,7 +372,8 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, size_t sz = cxt->record_size; cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, - &cxt->ecc_info); + &cxt->ecc_info, + cxt->memtype); if (IS_ERR(cxt->przs[i])) { err = PTR_ERR(cxt->przs[i]); dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", @@ -396,7 +403,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return -ENOMEM; } - *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info); + *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); @@ -443,6 +450,7 @@ static int ramoops_probe(struct platform_device *pdev) cxt->size = pdata->mem_size; cxt->phys_addr = pdata->mem_address; + cxt->memtype = pdata->mem_type; cxt->record_size = pdata->record_size; cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; @@ -572,6 +580,7 @@ static void ramoops_register_dummy(void) dummy_data->mem_size = mem_size; dummy_data->mem_address = mem_address; + dummy_data->mem_type = 0; dummy_data->record_size = record_size; dummy_data->console_size = ramoops_console_size; dummy_data->ftrace_size = ramoops_ftrace_size; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 9d7b9a83699e..76c3f80efdfa 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -380,7 +380,8 @@ void persistent_ram_zap(struct persistent_ram_zone *prz) persistent_ram_update_header_ecc(prz); } -static void *persistent_ram_vmap(phys_addr_t start, size_t size) +static void *persistent_ram_vmap(phys_addr_t start, size_t size, + unsigned int memtype) { struct page **pages; phys_addr_t page_start; @@ -392,7 +393,10 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size) page_start = start - offset_in_page(start); page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE); - prot = pgprot_noncached(PAGE_KERNEL); + if (memtype) + prot = pgprot_noncached(PAGE_KERNEL); + else + prot = pgprot_writecombine(PAGE_KERNEL); pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); if (!pages) { @@ -411,8 +415,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size) return vaddr; } -static void *persistent_ram_iomap(phys_addr_t start, size_t size) +static void *persistent_ram_iomap(phys_addr_t start, size_t size, + unsigned int memtype) { + void *va; + if (!request_mem_region(start, size, "persistent_ram")) { pr_err("request mem region (0x%llx@0x%llx) failed\n", (unsigned long long)size, (unsigned long long)start); @@ -422,19 +429,24 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size) buffer_start_add = buffer_start_add_locked; buffer_size_add = buffer_size_add_locked; - return ioremap(start, size); + if (memtype) + va = ioremap(start, size); + else + va = ioremap_wc(start, size); + + return va; } static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, - struct persistent_ram_zone *prz) + struct persistent_ram_zone *prz, int memtype) { prz->paddr = start; prz->size = size; if (pfn_valid(start >> PAGE_SHIFT)) - prz->vaddr = persistent_ram_vmap(start, size); + prz->vaddr = persistent_ram_vmap(start, size, memtype); else - prz->vaddr = persistent_ram_iomap(start, size); + prz->vaddr = persistent_ram_iomap(start, size, memtype); if (!prz->vaddr) { pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__, @@ -500,7 +512,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz) } struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, - u32 sig, struct persistent_ram_ecc_info *ecc_info) + u32 sig, struct persistent_ram_ecc_info *ecc_info, + unsigned int memtype) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -511,7 +524,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, goto err; } - ret = persistent_ram_buffer_map(start, size, prz); + ret = persistent_ram_buffer_map(start, size, prz, memtype); if (ret) goto err; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index d571e173a990..9d6486d416a3 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2772,7 +2772,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, if (journal_init_dev(sb, journal, j_dev_name) != 0) { reiserfs_warning(sb, "sh-462", - "unable to initialize jornal device"); + "unable to initialize journal device"); goto free_and_return; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index b5b593c45270..538519ee37d9 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -262,6 +262,7 @@ static int write_begin_slow(struct address_space *mapping, if (err) { unlock_page(page); page_cache_release(page); + ubifs_release_budget(c, &req); return err; } } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index fb166e204441..f6ac3f29323c 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -571,7 +571,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, aligned_dlen = ALIGN(dlen, 8); aligned_ilen = ALIGN(ilen, 8); + len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ; + /* Make sure to also account for extended attributes */ + len += host_ui->data_len; + dent = kmalloc(len, GFP_NOFS); if (!dent) return -ENOMEM; @@ -648,7 +652,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, ino_key_init(c, &ino_key, dir->i_ino); ino_offs += aligned_ilen; - err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ); + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, + UBIFS_INO_NODE_SZ + host_ui->data_len); if (err) goto out_ro; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h deleted file mode 100644 index 6e247a99f5db..000000000000 --- a/fs/xfs/libxfs/xfs_ag.h +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_AG_H__ -#define __XFS_AG_H__ - -/* - * Allocation group header - * This is divided into three structures, placed in sequential 512-byte - * buffers after a copy of the superblock (also in a 512-byte buffer). - */ - -struct xfs_buf; -struct xfs_mount; -struct xfs_trans; - -#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ -#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ -#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */ -#define XFS_AGF_VERSION 1 -#define XFS_AGI_VERSION 1 - -#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION) -#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) - -/* - * Btree number 0 is bno, 1 is cnt. This value gives the size of the - * arrays below. - */ -#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1) - -/* - * The second word of agf_levels in the first a.g. overlaps the EFS - * superblock's magic number. Since the magic numbers valid for EFS - * are > 64k, our value cannot be confused for an EFS superblock's. - */ - -typedef struct xfs_agf { - /* - * Common allocation group header information - */ - __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */ - __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */ - __be32 agf_seqno; /* sequence # starting from 0 */ - __be32 agf_length; /* size in blocks of a.g. */ - /* - * Freespace information - */ - __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ - __be32 agf_spare0; /* spare field */ - __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ - __be32 agf_spare1; /* spare field */ - - __be32 agf_flfirst; /* first freelist block's index */ - __be32 agf_fllast; /* last freelist block's index */ - __be32 agf_flcount; /* count of blocks in freelist */ - __be32 agf_freeblks; /* total free blocks */ - - __be32 agf_longest; /* longest free space */ - __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ - uuid_t agf_uuid; /* uuid of filesystem */ - - /* - * reserve some contiguous space for future logged fields before we add - * the unlogged fields. This makes the range logging via flags and - * structure offsets much simpler. - */ - __be64 agf_spare64[16]; - - /* unlogged fields, written during buffer writeback. */ - __be64 agf_lsn; /* last write sequence */ - __be32 agf_crc; /* crc of agf sector */ - __be32 agf_spare2; - - /* structure must be padded to 64 bit alignment */ -} xfs_agf_t; - -#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) - -#define XFS_AGF_MAGICNUM 0x00000001 -#define XFS_AGF_VERSIONNUM 0x00000002 -#define XFS_AGF_SEQNO 0x00000004 -#define XFS_AGF_LENGTH 0x00000008 -#define XFS_AGF_ROOTS 0x00000010 -#define XFS_AGF_LEVELS 0x00000020 -#define XFS_AGF_FLFIRST 0x00000040 -#define XFS_AGF_FLLAST 0x00000080 -#define XFS_AGF_FLCOUNT 0x00000100 -#define XFS_AGF_FREEBLKS 0x00000200 -#define XFS_AGF_LONGEST 0x00000400 -#define XFS_AGF_BTREEBLKS 0x00000800 -#define XFS_AGF_UUID 0x00001000 -#define XFS_AGF_NUM_BITS 13 -#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) - -#define XFS_AGF_FLAGS \ - { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ - { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \ - { XFS_AGF_SEQNO, "SEQNO" }, \ - { XFS_AGF_LENGTH, "LENGTH" }, \ - { XFS_AGF_ROOTS, "ROOTS" }, \ - { XFS_AGF_LEVELS, "LEVELS" }, \ - { XFS_AGF_FLFIRST, "FLFIRST" }, \ - { XFS_AGF_FLLAST, "FLLAST" }, \ - { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ - { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ - { XFS_AGF_LONGEST, "LONGEST" }, \ - { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ - { XFS_AGF_UUID, "UUID" } - -/* disk block (xfs_daddr_t) in the AG */ -#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) -#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) -#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) - -extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); - -/* - * Size of the unlinked inode hash table in the agi. - */ -#define XFS_AGI_UNLINKED_BUCKETS 64 - -typedef struct xfs_agi { - /* - * Common allocation group header information - */ - __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */ - __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */ - __be32 agi_seqno; /* sequence # starting from 0 */ - __be32 agi_length; /* size in blocks of a.g. */ - /* - * Inode information - * Inodes are mapped by interpreting the inode number, so no - * mapping data is needed here. - */ - __be32 agi_count; /* count of allocated inodes */ - __be32 agi_root; /* root of inode btree */ - __be32 agi_level; /* levels in inode btree */ - __be32 agi_freecount; /* number of free inodes */ - - __be32 agi_newino; /* new inode just allocated */ - __be32 agi_dirino; /* last directory inode chunk */ - /* - * Hash table of inodes which have been unlinked but are - * still being referenced. - */ - __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; - /* - * This marks the end of logging region 1 and start of logging region 2. - */ - uuid_t agi_uuid; /* uuid of filesystem */ - __be32 agi_crc; /* crc of agi sector */ - __be32 agi_pad32; - __be64 agi_lsn; /* last write sequence */ - - __be32 agi_free_root; /* root of the free inode btree */ - __be32 agi_free_level;/* levels in free inode btree */ - - /* structure must be padded to 64 bit alignment */ -} xfs_agi_t; - -#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) - -#define XFS_AGI_MAGICNUM (1 << 0) -#define XFS_AGI_VERSIONNUM (1 << 1) -#define XFS_AGI_SEQNO (1 << 2) -#define XFS_AGI_LENGTH (1 << 3) -#define XFS_AGI_COUNT (1 << 4) -#define XFS_AGI_ROOT (1 << 5) -#define XFS_AGI_LEVEL (1 << 6) -#define XFS_AGI_FREECOUNT (1 << 7) -#define XFS_AGI_NEWINO (1 << 8) -#define XFS_AGI_DIRINO (1 << 9) -#define XFS_AGI_UNLINKED (1 << 10) -#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ -#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) -#define XFS_AGI_FREE_ROOT (1 << 11) -#define XFS_AGI_FREE_LEVEL (1 << 12) -#define XFS_AGI_NUM_BITS_R2 13 - -/* disk block (xfs_daddr_t) in the AG */ -#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) -#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) -#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) - -extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf **bpp); - -/* - * The third a.g. block contains the a.g. freelist, an array - * of block pointers to blocks owned by the allocation btree code. - */ -#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) -#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) -#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) - -#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ - &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ - (__be32 *)(bp)->b_addr) - -/* - * Size of the AGFL. For CRC-enabled filesystes we steal a couple of - * slots in the beginning of the block for a proper header with the - * location information and CRC. - */ -#define XFS_AGFL_SIZE(mp) \ - (((mp)->m_sb.sb_sectsize - \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ - sizeof(struct xfs_agfl) : 0)) / \ - sizeof(xfs_agblock_t)) - -typedef struct xfs_agfl { - __be32 agfl_magicnum; - __be32 agfl_seqno; - uuid_t agfl_uuid; - __be64 agfl_lsn; - __be32 agfl_crc; - __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ -} xfs_agfl_t; - -#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) - -/* - * tags for inode radix tree - */ -#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup - in xfs_inode_ag_iterator */ -#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ -#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ - -#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) -#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ - (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) -#define XFS_MIN_FREELIST(a,mp) \ - (XFS_MIN_FREELIST_RAW( \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) -#define XFS_MIN_FREELIST_PAG(pag,mp) \ - (XFS_MIN_FREELIST_RAW( \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) - -#define XFS_AGB_TO_FSB(mp,agno,agbno) \ - (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) -#define XFS_FSB_TO_AGNO(mp,fsbno) \ - ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) -#define XFS_FSB_TO_AGBNO(mp,fsbno) \ - ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog))) -#define XFS_AGB_TO_DADDR(mp,agno,agbno) \ - ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \ - (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))) -#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d)) - -/* - * For checking for bad ranges of xfs_daddr_t's, covering multiple - * allocation groups or a single xfs_daddr_t that's a superblock copy. - */ -#define XFS_AG_CHECK_DADDR(mp,d,len) \ - ((len) == 1 ? \ - ASSERT((d) == XFS_SB_DADDR || \ - xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \ - ASSERT(xfs_daddr_to_agno(mp, d) == \ - xfs_daddr_to_agno(mp, (d) + (len) - 1))) - -#endif /* __XFS_AG_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index eff34218f405..a6fbf4472017 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -23,7 +23,6 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index feacb061bab7..d1b4b6a5c894 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -231,4 +231,7 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ +int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index e0e83e24d3ef..59d521c09a17 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -22,7 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_alloc_btree.h" diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 353fb425faef..0a472fbe06d4 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -42,7 +40,6 @@ #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" -#include "xfs_dinode.h" /* * xfs_attr.c diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index b1f73dbbf3d8..5d38e8b8a913 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -24,7 +24,6 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -41,7 +40,6 @@ #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" -#include "xfs_dinode.h" #include "xfs_dir2.h" diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 7510ab8058a4..20de88d1bf86 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -23,8 +23,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 79c981984dca..b5eb4743f75a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -22,9 +22,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -46,7 +44,6 @@ #include "xfs_trace.h" #include "xfs_symlink.h" #include "xfs_attr_leaf.h" -#include "xfs_dinode.h" #include "xfs_filestream.h" @@ -5450,13 +5447,11 @@ xfs_bmse_merge( struct xfs_btree_cur *cur, int *logflags) /* output */ { - struct xfs_ifork *ifp; struct xfs_bmbt_irec got; struct xfs_bmbt_irec left; xfs_filblks_t blockcount; int error, i; - ifp = XFS_IFORK_PTR(ip, whichfork); xfs_bmbt_get_all(gotp, &got); xfs_bmbt_get_all(leftp, &left); blockcount = left.br_blockcount + got.br_blockcount; @@ -5489,32 +5484,25 @@ xfs_bmse_merge( error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, got.br_blockcount, &i); if (error) - goto out_error; - XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); error = xfs_btree_delete(cur, &i); if (error) - goto out_error; - XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); /* lookup and update size of the previous extent */ error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, left.br_blockcount, &i); if (error) - goto out_error; - XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); left.br_blockcount = blockcount; - error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, - left.br_blockcount, left.br_state); - if (error) - goto out_error; - - return 0; - -out_error: - return error; + return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, + left.br_blockcount, left.br_state); } /* @@ -5544,35 +5532,29 @@ xfs_bmse_shift_one( startoff = got.br_startoff - offset_shift_fsb; /* delalloc extents should be prevented by caller */ - XFS_WANT_CORRUPTED_GOTO(!isnullstartblock(got.br_startblock), - out_error); + XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock)); /* - * If this is the first extent in the file, make sure there's enough - * room at the start of the file and jump right to the shift as there's - * no left extent to merge. + * Check for merge if we've got an extent to the left, otherwise make + * sure there's enough room at the start of the file for the shift. */ - if (*current_ext == 0) { - if (got.br_startoff < offset_shift_fsb) - return -EINVAL; - goto shift_extent; - } + if (*current_ext) { + /* grab the left extent and check for a large enough hole */ + leftp = xfs_iext_get_ext(ifp, *current_ext - 1); + xfs_bmbt_get_all(leftp, &left); - /* grab the left extent and check for a large enough hole */ - leftp = xfs_iext_get_ext(ifp, *current_ext - 1); - xfs_bmbt_get_all(leftp, &left); + if (startoff < left.br_startoff + left.br_blockcount) + return -EINVAL; - if (startoff < left.br_startoff + left.br_blockcount) + /* check whether to merge the extent or shift it down */ + if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) { + return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, + *current_ext, gotp, leftp, cur, + logflags); + } + } else if (got.br_startoff < offset_shift_fsb) return -EINVAL; - /* check whether to merge the extent or shift it down */ - if (!xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) - goto shift_extent; - - return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, *current_ext, - gotp, leftp, cur, logflags); - -shift_extent: /* * Increment the extent index for the next iteration, update the start * offset of the in-core extent and update the btree if applicable. @@ -5589,18 +5571,11 @@ shift_extent: got.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + XFS_WANT_CORRUPTED_RETURN(i == 1); got.br_startoff = startoff; - error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, + return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, got.br_blockcount, got.br_state); - if (error) - return error; - - return 0; - -out_error: - return error; } /* diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index fba753308f31..2c44c8e50782 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -36,7 +34,6 @@ #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_cksum.h" -#include "xfs_dinode.h" /* * Determine the extent state. diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 8fe6a93ff473..81cad433df85 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index fd827530afec..9cb0115c6bd1 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -23,8 +23,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -514,7 +512,6 @@ xfs_da3_root_split( struct xfs_buf *bp; struct xfs_inode *dp; struct xfs_trans *tp; - struct xfs_mount *mp; struct xfs_dir2_leaf *leaf; xfs_dablk_t blkno; int level; @@ -534,7 +531,6 @@ xfs_da3_root_split( dp = args->dp; tp = args->trans; - mp = state->mp; error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); if (error) return error; @@ -2342,14 +2338,12 @@ xfs_da_shrink_inode( xfs_inode_t *dp; int done, error, w, count; xfs_trans_t *tp; - xfs_mount_t *mp; trace_xfs_da_shrink_inode(args); dp = args->dp; w = args->whichfork; tp = args->trans; - mp = dp->i_mount; count = args->geo->fsbcount; for (;;) { /* diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index 7e42fdfd2f1d..9d624a622946 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -22,8 +22,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" diff --git a/fs/xfs/libxfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h deleted file mode 100644 index 623bbe8fd921..000000000000 --- a/fs/xfs/libxfs/xfs_dinode.h +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DINODE_H__ -#define __XFS_DINODE_H__ - -#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ -#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) - -typedef struct xfs_timestamp { - __be32 t_sec; /* timestamp seconds */ - __be32 t_nsec; /* timestamp nanoseconds */ -} xfs_timestamp_t; - -/* - * On-disk inode structure. - * - * This is just the header or "dinode core", the inode is expanded to fill a - * variable size the leftover area split into a data and an attribute fork. - * The format of the data and attribute fork depends on the format of the - * inode as indicated by di_format and di_aformat. To access the data and - * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros - * below. - * - * There is a very similar struct icdinode in xfs_inode which matches the - * layout of the first 96 bytes of this structure, but is kept in native - * format instead of big endian. - * - * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed - * padding field for v3 inodes. - */ -typedef struct xfs_dinode { - __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ - __be16 di_mode; /* mode and type of file */ - __u8 di_version; /* inode version */ - __u8 di_format; /* format of di_c data */ - __be16 di_onlink; /* old number of links to file */ - __be32 di_uid; /* owner's user id */ - __be32 di_gid; /* owner's group id */ - __be32 di_nlink; /* number of links to file */ - __be16 di_projid_lo; /* lower part of owner's project id */ - __be16 di_projid_hi; /* higher part owner's project id */ - __u8 di_pad[6]; /* unused, zeroed space */ - __be16 di_flushiter; /* incremented on flush */ - xfs_timestamp_t di_atime; /* time last accessed */ - xfs_timestamp_t di_mtime; /* time last modified */ - xfs_timestamp_t di_ctime; /* time created/inode modified */ - __be64 di_size; /* number of bytes in file */ - __be64 di_nblocks; /* # of direct & btree blocks used */ - __be32 di_extsize; /* basic/minimum extent size for file */ - __be32 di_nextents; /* number of extents in data fork */ - __be16 di_anextents; /* number of extents in attribute fork*/ - __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ - __s8 di_aformat; /* format of attr fork's data */ - __be32 di_dmevmask; /* DMIG event mask */ - __be16 di_dmstate; /* DMIG state info */ - __be16 di_flags; /* random flags, XFS_DIFLAG_... */ - __be32 di_gen; /* generation number */ - - /* di_next_unlinked is the only non-core field in the old dinode */ - __be32 di_next_unlinked;/* agi unlinked list ptr */ - - /* start of the extended dinode, writable fields */ - __le32 di_crc; /* CRC of the inode */ - __be64 di_changecount; /* number of attribute changes */ - __be64 di_lsn; /* flush sequence */ - __be64 di_flags2; /* more random flags */ - __u8 di_pad2[16]; /* more padding for future expansion */ - - /* fields only written to during inode creation */ - xfs_timestamp_t di_crtime; /* time created */ - __be64 di_ino; /* inode number */ - uuid_t di_uuid; /* UUID of the filesystem */ - - /* structure must be padded to 64 bit alignment */ -} xfs_dinode_t; - -#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) - -#define DI_MAX_FLUSH 0xffff - -/* - * Size of the core inode on disk. Version 1 and 2 inodes have - * the same size, but version 3 has grown a few additional fields. - */ -static inline uint xfs_dinode_size(int version) -{ - if (version == 3) - return sizeof(struct xfs_dinode); - return offsetof(struct xfs_dinode, di_crc); -} - -/* - * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. - * Since the pathconf interface is signed, we use 2^31 - 1 instead. - * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. - */ -#define XFS_MAXLINK ((1U << 31) - 1U) -#define XFS_MAXLINK_1 65535U - -/* - * Values for di_format - */ -typedef enum xfs_dinode_fmt { - XFS_DINODE_FMT_DEV, /* xfs_dev_t */ - XFS_DINODE_FMT_LOCAL, /* bulk data */ - XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ - XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ - XFS_DINODE_FMT_UUID /* uuid_t */ -} xfs_dinode_fmt_t; - -/* - * Inode minimum and maximum sizes. - */ -#define XFS_DINODE_MIN_LOG 8 -#define XFS_DINODE_MAX_LOG 11 -#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG) -#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG) - -/* - * Inode size for given fs. - */ -#define XFS_LITINO(mp, version) \ - ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) - -/* - * Inode data & attribute fork sizes, per inode. - */ -#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) -#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) - -#define XFS_DFORK_DSIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_DFORK_BOFF(dip) : \ - XFS_LITINO(mp, (dip)->di_version)) -#define XFS_DFORK_ASIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ - 0) -#define XFS_DFORK_SIZE(dip,mp,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_DFORK_DSIZE(dip, mp) : \ - XFS_DFORK_ASIZE(dip, mp)) - -/* - * Return pointers to the data or attribute forks. - */ -#define XFS_DFORK_DPTR(dip) \ - ((char *)dip + xfs_dinode_size(dip->di_version)) -#define XFS_DFORK_APTR(dip) \ - (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) -#define XFS_DFORK_PTR(dip,w) \ - ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) - -#define XFS_DFORK_FORMAT(dip,w) \ - ((w) == XFS_DATA_FORK ? \ - (dip)->di_format : \ - (dip)->di_aformat) -#define XFS_DFORK_NEXTENTS(dip,w) \ - ((w) == XFS_DATA_FORK ? \ - be32_to_cpu((dip)->di_nextents) : \ - be16_to_cpu((dip)->di_anextents)) - -#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr)) - -/* - * For block and character special files the 32bit dev_t is stored at the - * beginning of the data fork. - */ -static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip) -{ - return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip)); -} - -static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) -{ - *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev); -} - -/* - * Values for di_flags - * There should be a one-to-one correspondence between these flags and the - * XFS_XFLAG_s. - */ -#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ -#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ -#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */ -#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */ -#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */ -#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */ -#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ -#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ -#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ -#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ -#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ -#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ -#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ -#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ -#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ -#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) -#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) -#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) -#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT) -#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT) -#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT) -#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT) -#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT) -#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) -#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) -#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) -#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) -#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) -#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) -#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) - -#ifdef CONFIG_XFS_RT -#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) -#else -#define XFS_IS_REALTIME_INODE(ip) (0) -#endif - -#define XFS_DIFLAG_ANY \ - (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ - XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ - XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ - XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ - XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) - -#endif /* __XFS_DINODE_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 7075aaf131f4..a69fb3a1e161 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -20,9 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_inum.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -34,10 +31,25 @@ #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_dinode.h" struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; +/* + * @mode, if set, indicates that the type field needs to be set up. + * This uses the transformation from file mode to DT_* as defined in linux/fs.h + * for file type specification. This will be propagated into the directory + * structure if appropriate for the given operation and filesystem config. + */ +const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { + [0] = XFS_DIR3_FT_UNKNOWN, + [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR, + [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK, + [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK, +}; /* * ASCII case-insensitive (ie. A-Z) support for directories that was diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 4dff261e6ed5..e55353651f5b 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -32,6 +32,12 @@ struct xfs_dir2_data_unused; extern struct xfs_name xfs_name_dotdot; /* + * directory filetype conversion tables. + */ +#define S_SHIFT 12 +extern const unsigned char xfs_mode_to_ftype[]; + +/* * directory operations vector for encode/decode routines */ struct xfs_dir_ops { @@ -177,4 +183,138 @@ extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; extern const struct xfs_buf_ops xfs_dir3_free_buf_ops; extern const struct xfs_buf_ops xfs_dir3_data_buf_ops; +/* + * Directory offset/block conversion functions. + * + * DB blocks here are logical directory block numbers, not filesystem blocks. + */ + +/* + * Convert dataptr to byte in file space + */ +static inline xfs_dir2_off_t +xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp) +{ + return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; +} + +/* + * Convert byte in file space to dataptr. It had better be aligned. + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by) +{ + return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); +} + +/* + * Convert byte in space to (DB) block + */ +static inline xfs_dir2_db_t +xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_db_t)(by >> geo->blklog); +} + +/* + * Convert dataptr to a block number + */ +static inline xfs_dir2_db_t +xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert byte in space to offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1)); +} + +/* + * Convert dataptr to a byte offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert block and offset to byte in space + */ +static inline xfs_dir2_off_t +xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return ((xfs_dir2_off_t)db << geo->blklog) + o; +} + +/* + * Convert block (DB) to block (dablk) + */ +static inline xfs_dablk_t +xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog)); +} + +/* + * Convert byte in space to (DA) block + */ +static inline xfs_dablk_t +xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by)); +} + +/* + * Convert block and offset to dataptr + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o)); +} + +/* + * Convert block (dablk) to block (DB) + */ +static inline xfs_dir2_db_t +xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog)); +} + +/* + * Convert block (dablk) to byte offset in space + */ +static inline xfs_dir2_off_t +xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0); +} + +/* + * Directory tail pointer accessor functions. Based on block geometry. + */ +static inline struct xfs_dir2_block_tail * +xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir2_block_tail *) + ((char *)hdr + geo->blksize)) - 1; +} + +static inline struct xfs_dir2_leaf_tail * +xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) +{ + return (struct xfs_dir2_leaf_tail *) + ((char *)lp + geo->blksize - + sizeof(struct xfs_dir2_leaf_tail)); +} + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 9628ceccfa02..9354e190b82e 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -36,7 +34,6 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" -#include "xfs_dinode.h" /* * Local function prototypes. @@ -353,7 +350,6 @@ xfs_dir2_block_addname( int low; /* low index for binary srch */ int lowstale; /* low stale index */ int mid=0; /* midpoint for binary srch */ - xfs_mount_t *mp; /* filesystem mount point */ int needlog; /* need to log header */ int needscan; /* need to rescan freespace */ __be16 *tagp; /* pointer to tag value */ @@ -363,7 +359,6 @@ xfs_dir2_block_addname( dp = args->dp; tp = args->trans; - mp = dp->i_mount; /* Read the (one and only) directory block into bp. */ error = xfs_dir3_block_read(tp, dp, &bp); @@ -618,7 +613,6 @@ xfs_dir2_block_lookup( xfs_inode_t *dp; /* incore inode */ int ent; /* entry index */ int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ trace_xfs_dir2_block_lookup(args); @@ -629,7 +623,6 @@ xfs_dir2_block_lookup( if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) return error; dp = args->dp; - mp = dp->i_mount; hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); btp = xfs_dir2_block_tail_p(args->geo, hdr); @@ -770,7 +763,6 @@ xfs_dir2_block_removename( xfs_inode_t *dp; /* incore inode */ int ent; /* block leaf entry index */ int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ int needlog; /* need to log block header */ int needscan; /* need to fixup bestfree */ xfs_dir2_sf_hdr_t sfh; /* shortform header */ @@ -788,7 +780,6 @@ xfs_dir2_block_removename( } dp = args->dp; tp = args->trans; - mp = dp->i_mount; hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); @@ -852,7 +843,6 @@ xfs_dir2_block_replace( xfs_inode_t *dp; /* incore inode */ int ent; /* leaf entry index */ int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ trace_xfs_dir2_block_replace(args); @@ -864,7 +854,6 @@ xfs_dir2_block_replace( return error; } dp = args->dp; - mp = dp->i_mount; hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index fdd803fecb8e..5ff31be9b1cd 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index a19174eb3cb2..106119955400 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -384,7 +382,6 @@ xfs_dir2_block_to_leaf( xfs_dir2_db_t ldb; /* leaf block's bno */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */ - xfs_mount_t *mp; /* filesystem mount point */ int needlog; /* need to log block header */ int needscan; /* need to rescan bestfree */ xfs_trans_t *tp; /* transaction pointer */ @@ -395,7 +392,6 @@ xfs_dir2_block_to_leaf( trace_xfs_dir2_block_to_leaf(args); dp = args->dp; - mp = dp->i_mount; tp = args->trans; /* * Add the leaf block to the inode. @@ -626,7 +622,6 @@ xfs_dir2_leaf_addname( int lfloghigh; /* high leaf logging index */ int lowstale; /* index of prev stale leaf */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ - xfs_mount_t *mp; /* filesystem mount point */ int needbytes; /* leaf block bytes needed */ int needlog; /* need to log data header */ int needscan; /* need to rescan data free */ @@ -641,7 +636,6 @@ xfs_dir2_leaf_addname( dp = args->dp; tp = args->trans; - mp = dp->i_mount; error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) @@ -1356,11 +1350,9 @@ xfs_dir2_leaf_removename( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ int needlog; /* need to log data header */ int needscan; /* need to rescan data frees */ xfs_dir2_data_off_t oldbest; /* old value of best free */ - xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir2_data_free *bf; /* bestfree table */ struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; @@ -1374,8 +1366,6 @@ xfs_dir2_leaf_removename( return error; } dp = args->dp; - tp = args->trans; - mp = dp->i_mount; leaf = lbp->b_addr; hdr = dbp->b_addr; xfs_dir3_data_check(dp, dbp); @@ -1607,11 +1597,9 @@ xfs_dir2_leaf_trim_data( int error; /* error return value */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ dp = args->dp; - mp = dp->i_mount; tp = args->trans; /* * Read the offending data block. We need its buffer. diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 2ae6ac2c11ae..41b80d3d3877 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -297,7 +295,6 @@ xfs_dir2_leaf_to_node( int i; /* leaf freespace index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ int n; /* count of live freespc ents */ xfs_dir2_data_off_t off; /* freespace entry value */ __be16 *to; /* pointer to freespace entry */ @@ -307,7 +304,6 @@ xfs_dir2_leaf_to_node( trace_xfs_dir2_leaf_to_node(args); dp = args->dp; - mp = dp->i_mount; tp = args->trans; /* * Add a freespace block to the directory. @@ -387,16 +383,12 @@ xfs_dir2_leafn_add( int lfloghigh; /* high leaf entry logging */ int lfloglow; /* low leaf entry logging */ int lowstale; /* previous stale entry */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir3_icleaf_hdr leafhdr; struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leafn_add(args, index); dp = args->dp; - mp = dp->i_mount; - tp = args->trans; leaf = bp->b_addr; dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ents = dp->d_ops->leaf_ents_p(leaf); @@ -1170,7 +1162,6 @@ xfs_dir2_leafn_remove( xfs_dir2_leaf_entry_t *lep; /* leaf entry */ int longest; /* longest data free entry */ int off; /* data block entry offset */ - xfs_mount_t *mp; /* filesystem mount point */ int needlog; /* need to log data header */ int needscan; /* need to rescan data frees */ xfs_trans_t *tp; /* transaction pointer */ @@ -1182,7 +1173,6 @@ xfs_dir2_leafn_remove( dp = args->dp; tp = args->trans; - mp = dp->i_mount; leaf = bp->b_addr; dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ents = dp->d_ops->leaf_ents_p(leaf); @@ -1323,7 +1313,6 @@ xfs_dir2_leafn_split( xfs_da_args_t *args; /* operation arguments */ xfs_dablk_t blkno; /* new leaf block number */ int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ struct xfs_inode *dp; /* @@ -1331,7 +1320,6 @@ xfs_dir2_leafn_split( */ args = state->args; dp = args->dp; - mp = dp->i_mount; ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC); error = xfs_da_grow_inode(args, &blkno); if (error) { @@ -2231,12 +2219,10 @@ xfs_dir2_node_trim_free( xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_free_t *free; /* freespace structure */ - xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir3_icfree_hdr freehdr; dp = args->dp; - mp = dp->i_mount; tp = args->trans; /* * Read the freespace block. diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 27ce0794d196..ef9f6ead96a4 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -20,140 +20,6 @@ struct dir_context; -/* - * Directory offset/block conversion functions. - * - * DB blocks here are logical directory block numbers, not filesystem blocks. - */ - -/* - * Convert dataptr to byte in file space - */ -static inline xfs_dir2_off_t -xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp) -{ - return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; -} - -/* - * Convert byte in file space to dataptr. It had better be aligned. - */ -static inline xfs_dir2_dataptr_t -xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by) -{ - return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); -} - -/* - * Convert byte in space to (DB) block - */ -static inline xfs_dir2_db_t -xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by) -{ - return (xfs_dir2_db_t)(by >> geo->blklog); -} - -/* - * Convert dataptr to a block number - */ -static inline xfs_dir2_db_t -xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) -{ - return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp)); -} - -/* - * Convert byte in space to offset in a block - */ -static inline xfs_dir2_data_aoff_t -xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by) -{ - return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1)); -} - -/* - * Convert dataptr to a byte offset in a block - */ -static inline xfs_dir2_data_aoff_t -xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) -{ - return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp)); -} - -/* - * Convert block and offset to byte in space - */ -static inline xfs_dir2_off_t -xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return ((xfs_dir2_off_t)db << geo->blklog) + o; -} - -/* - * Convert block (DB) to block (dablk) - */ -static inline xfs_dablk_t -xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db) -{ - return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog)); -} - -/* - * Convert byte in space to (DA) block - */ -static inline xfs_dablk_t -xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by) -{ - return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by)); -} - -/* - * Convert block and offset to dataptr - */ -static inline xfs_dir2_dataptr_t -xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o)); -} - -/* - * Convert block (dablk) to block (DB) - */ -static inline xfs_dir2_db_t -xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da) -{ - return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog)); -} - -/* - * Convert block (dablk) to byte offset in space - */ -static inline xfs_dir2_off_t -xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da) -{ - return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0); -} - -/* - * Directory tail pointer accessor functions. Based on block geometry. - */ -static inline struct xfs_dir2_block_tail * -xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr) -{ - return ((struct xfs_dir2_block_tail *) - ((char *)hdr + geo->blksize)) - 1; -} - -static inline struct xfs_dir2_leaf_tail * -xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) -{ - return (struct xfs_dir2_leaf_tail *) - ((char *)lp + geo->blksize - - sizeof(struct xfs_dir2_leaf_tail)); -} - /* xfs_dir2.c */ extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, @@ -161,12 +27,6 @@ extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); -#define S_SHIFT 12 -extern const unsigned char xfs_mode_to_ftype[]; - -extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, - __uint8_t filetype); - /* xfs_dir2_block.c */ extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 5079e051ef08..974d62e677f4 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -20,8 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -32,7 +30,6 @@ #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_trace.h" -#include "xfs_dinode.h" /* * Prototypes for internal functions. @@ -455,13 +452,11 @@ xfs_dir2_sf_addname_hard( xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */ - struct xfs_mount *mp; /* * Copy the old directory to the stack buffer. */ dp = args->dp; - mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_d.di_size; @@ -542,7 +537,6 @@ xfs_dir2_sf_addname_pick( xfs_inode_t *dp; /* incore directory inode */ int holefit; /* found hole it will fit in */ int i; /* entry number */ - xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_data_aoff_t offset; /* data block offset */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ @@ -550,7 +544,6 @@ xfs_dir2_sf_addname_pick( int used; /* data bytes used */ dp = args->dp; - mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = dp->d_ops->data_entsize(args->namelen); @@ -616,10 +609,8 @@ xfs_dir2_sf_check( int offset; /* data offset */ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - struct xfs_mount *mp; dp = args->dp; - mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; offset = dp->d_ops->data_first_offset; @@ -1016,12 +1007,10 @@ xfs_dir2_sf_toino4( int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ - struct xfs_mount *mp; trace_xfs_dir2_sf_toino4(args); dp = args->dp; - mp = dp->i_mount; /* * Copy the old directory to the buffer. @@ -1094,12 +1083,10 @@ xfs_dir2_sf_toino8( int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ - struct xfs_mount *mp; trace_xfs_dir2_sf_toino8(args); dp = args->dp; - mp = dp->i_mount; /* * Copy the old directory to the buffer. diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index bb969337efc8..6fbf2d853a54 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -22,8 +22,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_quota.h" diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 7e42bba9a420..fbd6da263571 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -34,6 +34,1077 @@ struct xfs_buf; struct xfs_ifork; /* + * Super block + * Fits into a sector-sized buffer at address 0 of each allocation group. + * Only the first of these is ever updated except during growfs. + */ +#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ +#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ +#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ +#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ +#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ +#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ +#define XFS_SB_VERSION_NUMBITS 0x000f +#define XFS_SB_VERSION_ALLFBITS 0xfff0 +#define XFS_SB_VERSION_ATTRBIT 0x0010 +#define XFS_SB_VERSION_NLINKBIT 0x0020 +#define XFS_SB_VERSION_QUOTABIT 0x0040 +#define XFS_SB_VERSION_ALIGNBIT 0x0080 +#define XFS_SB_VERSION_DALIGNBIT 0x0100 +#define XFS_SB_VERSION_SHAREDBIT 0x0200 +#define XFS_SB_VERSION_LOGV2BIT 0x0400 +#define XFS_SB_VERSION_SECTORBIT 0x0800 +#define XFS_SB_VERSION_EXTFLGBIT 0x1000 +#define XFS_SB_VERSION_DIRV2BIT 0x2000 +#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ +#define XFS_SB_VERSION_MOREBITSBIT 0x8000 + +/* + * Supported feature bit list is just all bits in the versionnum field because + * we've used them all up and understand them all. Except, of course, for the + * shared superblock bit, which nobody knows what it does and so is unsupported. + */ +#define XFS_SB_VERSION_OKBITS \ + ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \ + ~XFS_SB_VERSION_SHAREDBIT) + +/* + * There are two words to hold XFS "feature" bits: the original + * word, sb_versionnum, and sb_features2. Whenever a bit is set in + * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set. + * + * These defines represent bits in sb_features2. + */ +#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 +#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ +#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 +#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ +#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ +#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ +#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ +#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */ + +#define XFS_SB_VERSION2_OKBITS \ + (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_FTYPE) + +/* + * Superblock - in core version. Must match the ondisk version below. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_sb { + __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __uint32_t sb_blocksize; /* logical block size, bytes */ + xfs_rfsblock_t sb_dblocks; /* number of data blocks */ + xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ + xfs_rtblock_t sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + xfs_fsblock_t sb_logstart; /* starting block of log if internal */ + xfs_ino_t sb_rootino; /* root inode number */ + xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ + xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */ + xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */ + xfs_agblock_t sb_agblocks; /* size of an allocation group */ + xfs_agnumber_t sb_agcount; /* number of allocation groups */ + xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */ + xfs_extlen_t sb_logblocks; /* number of log blocks */ + __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ + __uint16_t sb_sectsize; /* volume sector size, bytes */ + __uint16_t sb_inodesize; /* inode size, bytes */ + __uint16_t sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __uint8_t sb_blocklog; /* log2 of sb_blocksize */ + __uint8_t sb_sectlog; /* log2 of sb_sectsize */ + __uint8_t sb_inodelog; /* log2 of sb_inodesize */ + __uint8_t sb_inopblog; /* log2 of sb_inopblock */ + __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __uint8_t sb_rextslog; /* log2 of sb_rextents */ + __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ + __uint8_t sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __uint64_t sb_icount; /* allocated inodes */ + __uint64_t sb_ifree; /* free inodes */ + __uint64_t sb_fdblocks; /* free data blocks */ + __uint64_t sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + xfs_ino_t sb_uquotino; /* user quota inode */ + xfs_ino_t sb_gquotino; /* group quota inode */ + __uint16_t sb_qflags; /* quota flags */ + __uint8_t sb_flags; /* misc. flags */ + __uint8_t sb_shared_vn; /* shared version number */ + xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __uint32_t sb_unit; /* stripe or raid unit */ + __uint32_t sb_width; /* stripe or raid width */ + __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ + __uint8_t sb_logsectlog; /* log2 of the log sector size */ + __uint16_t sb_logsectsize; /* sector size for the log, bytes */ + __uint32_t sb_logsunit; /* stripe unit size for the log */ + __uint32_t sb_features2; /* additional feature bits */ + + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __uint32_t sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __uint32_t sb_features_compat; + __uint32_t sb_features_ro_compat; + __uint32_t sb_features_incompat; + __uint32_t sb_features_log_incompat; + + __uint32_t sb_crc; /* superblock crc */ + __uint32_t sb_pad; + + xfs_ino_t sb_pquotino; /* project quota inode */ + xfs_lsn_t sb_lsn; /* last write sequence */ + + /* must be padded to 64 bit alignment */ +} xfs_sb_t; + +#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) + +/* + * Superblock - on disk version. Must match the in core version above. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_dsb { + __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __be32 sb_blocksize; /* logical block size, bytes */ + __be64 sb_dblocks; /* number of data blocks */ + __be64 sb_rblocks; /* number of realtime blocks */ + __be64 sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + __be64 sb_logstart; /* starting block of log if internal */ + __be64 sb_rootino; /* root inode number */ + __be64 sb_rbmino; /* bitmap inode for realtime extents */ + __be64 sb_rsumino; /* summary inode for rt bitmap */ + __be32 sb_rextsize; /* realtime extent size, blocks */ + __be32 sb_agblocks; /* size of an allocation group */ + __be32 sb_agcount; /* number of allocation groups */ + __be32 sb_rbmblocks; /* number of rt bitmap blocks */ + __be32 sb_logblocks; /* number of log blocks */ + __be16 sb_versionnum; /* header version == XFS_SB_VERSION */ + __be16 sb_sectsize; /* volume sector size, bytes */ + __be16 sb_inodesize; /* inode size, bytes */ + __be16 sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __u8 sb_blocklog; /* log2 of sb_blocksize */ + __u8 sb_sectlog; /* log2 of sb_sectsize */ + __u8 sb_inodelog; /* log2 of sb_inodesize */ + __u8 sb_inopblog; /* log2 of sb_inopblock */ + __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __u8 sb_rextslog; /* log2 of sb_rextents */ + __u8 sb_inprogress; /* mkfs is in progress, don't mount */ + __u8 sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __be64 sb_icount; /* allocated inodes */ + __be64 sb_ifree; /* free inodes */ + __be64 sb_fdblocks; /* free data blocks */ + __be64 sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + __be64 sb_uquotino; /* user quota inode */ + __be64 sb_gquotino; /* group quota inode */ + __be16 sb_qflags; /* quota flags */ + __u8 sb_flags; /* misc. flags */ + __u8 sb_shared_vn; /* shared version number */ + __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __be32 sb_unit; /* stripe or raid unit */ + __be32 sb_width; /* stripe or raid width */ + __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */ + __u8 sb_logsectlog; /* log2 of the log sector size */ + __be16 sb_logsectsize; /* sector size for the log, bytes */ + __be32 sb_logsunit; /* stripe unit size for the log */ + __be32 sb_features2; /* additional feature bits */ + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __be32 sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __be32 sb_features_compat; + __be32 sb_features_ro_compat; + __be32 sb_features_incompat; + __be32 sb_features_log_incompat; + + __le32 sb_crc; /* superblock crc */ + __be32 sb_pad; + + __be64 sb_pquotino; /* project quota inode */ + __be64 sb_lsn; /* last write sequence */ + + /* must be padded to 64 bit alignment */ +} xfs_dsb_t; + +/* + * Sequence number values for the fields. + */ +typedef enum { + XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS, + XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO, + XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS, + XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS, + XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE, + XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG, + XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG, + XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT, + XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO, + XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, + XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, + XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, + XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, + XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, + XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, + XFS_SBS_PQUOTINO, XFS_SBS_LSN, + XFS_SBS_FIELDCOUNT +} xfs_sb_field_t; + +/* + * Mask values, defined based on the xfs_sb_field_t values. + * Only define the ones we're using. + */ +#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x) +#define XFS_SB_UUID XFS_SB_MVAL(UUID) +#define XFS_SB_FNAME XFS_SB_MVAL(FNAME) +#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO) +#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO) +#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO) +#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM) +#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO) +#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO) +#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS) +#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) +#define XFS_SB_UNIT XFS_SB_MVAL(UNIT) +#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) +#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) +#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) +#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) +#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) +#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) +#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) +#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) +#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) +#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) +#define XFS_SB_CRC XFS_SB_MVAL(CRC) +#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) +#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) +#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) +#define XFS_SB_MOD_BITS \ + (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ + XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ + XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ + XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ + XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \ + XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \ + XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO) + + +/* + * Misc. Flags - warning - these will be cleared by xfs_repair unless + * a feature bit is set when the flag is used. + */ +#define XFS_SBF_NOFLAGS 0x00 /* no flags set */ +#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */ + +/* + * define max. shared version we can interoperate with + */ +#define XFS_SB_MAX_SHARED_VN 0 + +#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) + +/* + * The first XFS version we support is a v4 superblock with V2 directories. + */ +static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) +{ + if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) + return false; + + /* check for unknown features in the fs */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; + + return true; +} + +static inline bool xfs_sb_good_version(struct xfs_sb *sbp) +{ + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) + return true; + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + return xfs_sb_good_v4_features(sbp); + return false; +} + +/* + * Detect a mismatched features2 field. Older kernels read/wrote + * this into the wrong slot, so to be safe we keep them in sync. + */ +static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) +{ + return sbp->sb_bad_features2 != sbp->sb_features2; +} + +static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); +} + +static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; +} + +static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); +} + +static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; +} + +static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); +} + +static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); +} + +static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); +} + +static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); +} + +static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); +} + +static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); +} + +static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); +} + +/* + * sb_features2 bit version macros. + */ +static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); +} + +static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); +} + +static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; + sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT; +} + +static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) +{ + sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + if (!sbp->sb_features2) + sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; +} + +static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); +} + +static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; + sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT; +} + +/* + * Extended v5 superblock feature masks. These are to be used for new v5 + * superblock features only. + * + * Compat features are new features that old kernels will not notice or affect + * and so can mount read-write without issues. + * + * RO-Compat (read only) are features that old kernels can read but will break + * if they write. Hence only read-only mounts of such filesystems are allowed on + * kernels that don't support the feature bit. + * + * InCompat features are features which old kernels will not understand and so + * must not mount. + * + * Log-InCompat features are for changes to log formats or new transactions that + * can't be replayed on older kernels. The fields are set when the filesystem is + * mounted, and a clean unmount clears the fields. + */ +#define XFS_SB_FEAT_COMPAT_ALL 0 +#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL +static inline bool +xfs_sb_has_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_compat & feature) != 0; +} + +#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ +#define XFS_SB_FEAT_RO_COMPAT_ALL \ + (XFS_SB_FEAT_RO_COMPAT_FINOBT) +#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL +static inline bool +xfs_sb_has_ro_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_ro_compat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_ALL \ + (XFS_SB_FEAT_INCOMPAT_FTYPE) + +#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL +static inline bool +xfs_sb_has_incompat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_incompat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL +static inline bool +xfs_sb_has_incompat_log_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_log_incompat & feature) != 0; +} + +/* + * V5 superblock specific feature checks + */ +static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); +} + +static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); +} + +/* + * end of superblock version macros + */ + +static inline bool +xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) +{ + return (ino == sbp->sb_uquotino || + ino == sbp->sb_gquotino || + ino == sbp->sb_pquotino); +} + +#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ +#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) +#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) + +#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) +#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ + xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d)) +#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ + XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) + +/* + * File system sector to basic block conversions. + */ +#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) + +/* + * File system block to basic block conversions. + */ +#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log) +#define XFS_BB_TO_FSB(mp,bb) \ + (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) +#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) + +/* + * File system block to byte conversions. + */ +#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSB(mp,b) \ + ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) + +/* + * Allocation group header + * + * This is divided into three structures, placed in sequential 512-byte + * buffers after a copy of the superblock (also in a 512-byte buffer). + */ +#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ +#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ +#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */ +#define XFS_AGF_VERSION 1 +#define XFS_AGI_VERSION 1 + +#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION) +#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) + +/* + * Btree number 0 is bno, 1 is cnt. This value gives the size of the + * arrays below. + */ +#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1) + +/* + * The second word of agf_levels in the first a.g. overlaps the EFS + * superblock's magic number. Since the magic numbers valid for EFS + * are > 64k, our value cannot be confused for an EFS superblock's. + */ + +typedef struct xfs_agf { + /* + * Common allocation group header information + */ + __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */ + __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */ + __be32 agf_seqno; /* sequence # starting from 0 */ + __be32 agf_length; /* size in blocks of a.g. */ + /* + * Freespace information + */ + __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ + __be32 agf_spare0; /* spare field */ + __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ + __be32 agf_spare1; /* spare field */ + + __be32 agf_flfirst; /* first freelist block's index */ + __be32 agf_fllast; /* last freelist block's index */ + __be32 agf_flcount; /* count of blocks in freelist */ + __be32 agf_freeblks; /* total free blocks */ + + __be32 agf_longest; /* longest free space */ + __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ + uuid_t agf_uuid; /* uuid of filesystem */ + + /* + * reserve some contiguous space for future logged fields before we add + * the unlogged fields. This makes the range logging via flags and + * structure offsets much simpler. + */ + __be64 agf_spare64[16]; + + /* unlogged fields, written during buffer writeback. */ + __be64 agf_lsn; /* last write sequence */ + __be32 agf_crc; /* crc of agf sector */ + __be32 agf_spare2; + + /* structure must be padded to 64 bit alignment */ +} xfs_agf_t; + +#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) + +#define XFS_AGF_MAGICNUM 0x00000001 +#define XFS_AGF_VERSIONNUM 0x00000002 +#define XFS_AGF_SEQNO 0x00000004 +#define XFS_AGF_LENGTH 0x00000008 +#define XFS_AGF_ROOTS 0x00000010 +#define XFS_AGF_LEVELS 0x00000020 +#define XFS_AGF_FLFIRST 0x00000040 +#define XFS_AGF_FLLAST 0x00000080 +#define XFS_AGF_FLCOUNT 0x00000100 +#define XFS_AGF_FREEBLKS 0x00000200 +#define XFS_AGF_LONGEST 0x00000400 +#define XFS_AGF_BTREEBLKS 0x00000800 +#define XFS_AGF_UUID 0x00001000 +#define XFS_AGF_NUM_BITS 13 +#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) + +#define XFS_AGF_FLAGS \ + { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ + { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \ + { XFS_AGF_SEQNO, "SEQNO" }, \ + { XFS_AGF_LENGTH, "LENGTH" }, \ + { XFS_AGF_ROOTS, "ROOTS" }, \ + { XFS_AGF_LEVELS, "LEVELS" }, \ + { XFS_AGF_FLFIRST, "FLFIRST" }, \ + { XFS_AGF_FLLAST, "FLLAST" }, \ + { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ + { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ + { XFS_AGF_LONGEST, "LONGEST" }, \ + { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ + { XFS_AGF_UUID, "UUID" } + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) +#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) +#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) + +/* + * Size of the unlinked inode hash table in the agi. + */ +#define XFS_AGI_UNLINKED_BUCKETS 64 + +typedef struct xfs_agi { + /* + * Common allocation group header information + */ + __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */ + __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */ + __be32 agi_seqno; /* sequence # starting from 0 */ + __be32 agi_length; /* size in blocks of a.g. */ + /* + * Inode information + * Inodes are mapped by interpreting the inode number, so no + * mapping data is needed here. + */ + __be32 agi_count; /* count of allocated inodes */ + __be32 agi_root; /* root of inode btree */ + __be32 agi_level; /* levels in inode btree */ + __be32 agi_freecount; /* number of free inodes */ + + __be32 agi_newino; /* new inode just allocated */ + __be32 agi_dirino; /* last directory inode chunk */ + /* + * Hash table of inodes which have been unlinked but are + * still being referenced. + */ + __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; + /* + * This marks the end of logging region 1 and start of logging region 2. + */ + uuid_t agi_uuid; /* uuid of filesystem */ + __be32 agi_crc; /* crc of agi sector */ + __be32 agi_pad32; + __be64 agi_lsn; /* last write sequence */ + + __be32 agi_free_root; /* root of the free inode btree */ + __be32 agi_free_level;/* levels in free inode btree */ + + /* structure must be padded to 64 bit alignment */ +} xfs_agi_t; + +#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) + +#define XFS_AGI_MAGICNUM (1 << 0) +#define XFS_AGI_VERSIONNUM (1 << 1) +#define XFS_AGI_SEQNO (1 << 2) +#define XFS_AGI_LENGTH (1 << 3) +#define XFS_AGI_COUNT (1 << 4) +#define XFS_AGI_ROOT (1 << 5) +#define XFS_AGI_LEVEL (1 << 6) +#define XFS_AGI_FREECOUNT (1 << 7) +#define XFS_AGI_NEWINO (1 << 8) +#define XFS_AGI_DIRINO (1 << 9) +#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ +#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1 << 11) +#define XFS_AGI_FREE_LEVEL (1 << 12) +#define XFS_AGI_NUM_BITS_R2 13 + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) +#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) +#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) + +/* + * The third a.g. block contains the a.g. freelist, an array + * of block pointers to blocks owned by the allocation btree code. + */ +#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) +#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) +#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) + +#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ + (__be32 *)(bp)->b_addr) + +/* + * Size of the AGFL. For CRC-enabled filesystes we steal a couple of + * slots in the beginning of the block for a proper header with the + * location information and CRC. + */ +#define XFS_AGFL_SIZE(mp) \ + (((mp)->m_sb.sb_sectsize - \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + sizeof(struct xfs_agfl) : 0)) / \ + sizeof(xfs_agblock_t)) + +typedef struct xfs_agfl { + __be32 agfl_magicnum; + __be32 agfl_seqno; + uuid_t agfl_uuid; + __be64 agfl_lsn; + __be32 agfl_crc; + __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ +} xfs_agfl_t; + +#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) + + +#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) +#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ + (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) +#define XFS_MIN_FREELIST(a,mp) \ + (XFS_MIN_FREELIST_RAW( \ + be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ + be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) +#define XFS_MIN_FREELIST_PAG(pag,mp) \ + (XFS_MIN_FREELIST_RAW( \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) + +#define XFS_AGB_TO_FSB(mp,agno,agbno) \ + (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) +#define XFS_FSB_TO_AGNO(mp,fsbno) \ + ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) +#define XFS_FSB_TO_AGBNO(mp,fsbno) \ + ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog))) +#define XFS_AGB_TO_DADDR(mp,agno,agbno) \ + ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \ + (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))) +#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d)) + +/* + * For checking for bad ranges of xfs_daddr_t's, covering multiple + * allocation groups or a single xfs_daddr_t that's a superblock copy. + */ +#define XFS_AG_CHECK_DADDR(mp,d,len) \ + ((len) == 1 ? \ + ASSERT((d) == XFS_SB_DADDR || \ + xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \ + ASSERT(xfs_daddr_to_agno(mp, d) == \ + xfs_daddr_to_agno(mp, (d) + (len) - 1))) + +typedef struct xfs_timestamp { + __be32 t_sec; /* timestamp seconds */ + __be32 t_nsec; /* timestamp nanoseconds */ +} xfs_timestamp_t; + +/* + * On-disk inode structure. + * + * This is just the header or "dinode core", the inode is expanded to fill a + * variable size the leftover area split into a data and an attribute fork. + * The format of the data and attribute fork depends on the format of the + * inode as indicated by di_format and di_aformat. To access the data and + * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros + * below. + * + * There is a very similar struct icdinode in xfs_inode which matches the + * layout of the first 96 bytes of this structure, but is kept in native + * format instead of big endian. + * + * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed + * padding field for v3 inodes. + */ +#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ +#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) +typedef struct xfs_dinode { + __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __be16 di_mode; /* mode and type of file */ + __u8 di_version; /* inode version */ + __u8 di_format; /* format of di_c data */ + __be16 di_onlink; /* old number of links to file */ + __be32 di_uid; /* owner's user id */ + __be32 di_gid; /* owner's group id */ + __be32 di_nlink; /* number of links to file */ + __be16 di_projid_lo; /* lower part of owner's project id */ + __be16 di_projid_hi; /* higher part owner's project id */ + __u8 di_pad[6]; /* unused, zeroed space */ + __be16 di_flushiter; /* incremented on flush */ + xfs_timestamp_t di_atime; /* time last accessed */ + xfs_timestamp_t di_mtime; /* time last modified */ + xfs_timestamp_t di_ctime; /* time created/inode modified */ + __be64 di_size; /* number of bytes in file */ + __be64 di_nblocks; /* # of direct & btree blocks used */ + __be32 di_extsize; /* basic/minimum extent size for file */ + __be32 di_nextents; /* number of extents in data fork */ + __be16 di_anextents; /* number of extents in attribute fork*/ + __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ + __s8 di_aformat; /* format of attr fork's data */ + __be32 di_dmevmask; /* DMIG event mask */ + __be16 di_dmstate; /* DMIG state info */ + __be16 di_flags; /* random flags, XFS_DIFLAG_... */ + __be32 di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + __be32 di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __le32 di_crc; /* CRC of the inode */ + __be64 di_changecount; /* number of attribute changes */ + __be64 di_lsn; /* flush sequence */ + __be64 di_flags2; /* more random flags */ + __u8 di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_timestamp_t di_crtime; /* time created */ + __be64 di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_dinode_t; + +#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) + +#define DI_MAX_FLUSH 0xffff + +/* + * Size of the core inode on disk. Version 1 and 2 inodes have + * the same size, but version 3 has grown a few additional fields. + */ +static inline uint xfs_dinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_dinode); + return offsetof(struct xfs_dinode, di_crc); +} + +/* + * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. + * Since the pathconf interface is signed, we use 2^31 - 1 instead. + * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. + */ +#define XFS_MAXLINK ((1U << 31) - 1U) +#define XFS_MAXLINK_1 65535U + +/* + * Values for di_format + */ +typedef enum xfs_dinode_fmt { + XFS_DINODE_FMT_DEV, /* xfs_dev_t */ + XFS_DINODE_FMT_LOCAL, /* bulk data */ + XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ + XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ + XFS_DINODE_FMT_UUID /* uuid_t */ +} xfs_dinode_fmt_t; + +/* + * Inode minimum and maximum sizes. + */ +#define XFS_DINODE_MIN_LOG 8 +#define XFS_DINODE_MAX_LOG 11 +#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG) +#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG) + +/* + * Inode size for given fs. + */ +#define XFS_LITINO(mp, version) \ + ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) + +/* + * Inode data & attribute fork sizes, per inode. + */ +#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) +#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) + +#define XFS_DFORK_DSIZE(dip,mp) \ + (XFS_DFORK_Q(dip) ? \ + XFS_DFORK_BOFF(dip) : \ + XFS_LITINO(mp, (dip)->di_version)) +#define XFS_DFORK_ASIZE(dip,mp) \ + (XFS_DFORK_Q(dip) ? \ + XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ + 0) +#define XFS_DFORK_SIZE(dip,mp,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_DFORK_DSIZE(dip, mp) : \ + XFS_DFORK_ASIZE(dip, mp)) + +/* + * Return pointers to the data or attribute forks. + */ +#define XFS_DFORK_DPTR(dip) \ + ((char *)dip + xfs_dinode_size(dip->di_version)) +#define XFS_DFORK_APTR(dip) \ + (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) +#define XFS_DFORK_PTR(dip,w) \ + ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) + +#define XFS_DFORK_FORMAT(dip,w) \ + ((w) == XFS_DATA_FORK ? \ + (dip)->di_format : \ + (dip)->di_aformat) +#define XFS_DFORK_NEXTENTS(dip,w) \ + ((w) == XFS_DATA_FORK ? \ + be32_to_cpu((dip)->di_nextents) : \ + be16_to_cpu((dip)->di_anextents)) + +/* + * For block and character special files the 32bit dev_t is stored at the + * beginning of the data fork. + */ +static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip) +{ + return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip)); +} + +static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) +{ + *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev); +} + +/* + * Values for di_flags + * There should be a one-to-one correspondence between these flags and the + * XFS_XFLAG_s. + */ +#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ +#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ +#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */ +#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */ +#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */ +#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */ +#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ +#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ +#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ +#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ +#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ +#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ +#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ +#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ +#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ +#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) +#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) +#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) +#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT) +#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT) +#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT) +#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT) +#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT) +#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) +#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) +#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) +#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) +#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) +#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) +#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) + +#define XFS_DIFLAG_ANY \ + (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ + XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ + XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ + XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) + +/* + * Inode number format: + * low inopblog bits - offset in block + * next agblklog bits - block number in ag + * next agno_log bits - ag number + * high agno_log-agblklog-inopblog bits - 0 + */ +#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) +#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog +#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog +#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log +#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log +#define XFS_INO_BITS(mp) \ + XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp) +#define XFS_INO_TO_AGNO(mp,i) \ + ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp))) +#define XFS_INO_TO_AGINO(mp,i) \ + ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp))) +#define XFS_INO_TO_AGBNO(mp,i) \ + (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \ + XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp))) +#define XFS_INO_TO_OFFSET(mp,i) \ + ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#define XFS_INO_TO_FSB(mp,i) \ + XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i)) +#define XFS_AGINO_TO_INO(mp,a,i) \ + (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i)) +#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp)) +#define XFS_AGINO_TO_OFFSET(mp,i) \ + ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#define XFS_OFFBNO_TO_AGINO(mp,b,o) \ + ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) + +#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) +#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) + +/* * RealTime Device format definitions */ @@ -413,4 +1484,40 @@ struct xfs_btree_block { #define XFS_BTREE_LBLOCK_CRC_OFF \ offsetof(struct xfs_btree_block, bb_u.l.bb_crc) +/* + * On-disk XFS access control list structure. + */ +struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + __be16 ae_pad; /* fill the implicit hole in the structure */ +}; + +struct xfs_acl { + __be32 acl_cnt; + struct xfs_acl_entry acl_entry[0]; +}; + +/* + * The number of ACL entries allowed is defined by the on-disk format. + * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is + * limited only by the maximum size of the xattr that stores the information. + */ +#define XFS_ACL_MAX_ENTRIES(mp) \ + (xfs_sb_version_hascrc(&mp->m_sb) \ + ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + sizeof(struct xfs_acl_entry) \ + : 25) + +#define XFS_ACL_MAX_SIZE(mp) \ + (sizeof(struct xfs_acl) + \ + sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) + +/* On-disk XFS extended attribute names */ +#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" +#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" +#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) +#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) + #endif /* __XFS_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 23dcb72fc5e6..116ef1ddb3e3 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -22,9 +22,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -39,7 +37,6 @@ #include "xfs_buf_item.h" #include "xfs_icreate_item.h" #include "xfs_icache.h" -#include "xfs_dinode.h" #include "xfs_trace.h" @@ -48,12 +45,12 @@ */ static inline int xfs_ialloc_cluster_alignment( - xfs_alloc_arg_t *args) + struct xfs_mount *mp) { - if (xfs_sb_version_hasalign(&args->mp->m_sb) && - args->mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size)) - return args->mp->m_sb.sb_inoalignmt; + if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + return mp->m_sb.sb_inoalignmt; return 1; } @@ -412,7 +409,7 @@ xfs_ialloc_ag_alloc( * but not to use them in the actual exact allocation. */ args.alignment = 1; - args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; + args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1; /* Allow space for the inode btree to split. */ args.minleft = args.mp->m_in_maxlevels - 1; @@ -448,7 +445,7 @@ xfs_ialloc_ag_alloc( args.alignment = args.mp->m_dalign; isaligned = 1; } else - args.alignment = xfs_ialloc_cluster_alignment(&args); + args.alignment = xfs_ialloc_cluster_alignment(args.mp); /* * Need to figure out where to allocate the inode blocks. * Ideally they should be spaced out through the a.g. @@ -477,7 +474,7 @@ xfs_ialloc_ag_alloc( args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); - args.alignment = xfs_ialloc_cluster_alignment(&args); + args.alignment = xfs_ialloc_cluster_alignment(args.mp); if ((error = xfs_alloc_vextent(&args))) return error; } @@ -632,10 +629,24 @@ xfs_ialloc_ag_select( } /* - * Is there enough free space for the file plus a block of - * inodes? (if we need to allocate some)? + * Check that there is enough free space for the file plus a + * chunk of inodes if we need to allocate some. If this is the + * first pass across the AGs, take into account the potential + * space needed for alignment of inode chunks when checking the + * longest contiguous free space in the AG - this prevents us + * from getting ENOSPC because we have free space larger than + * m_ialloc_blks but alignment constraints prevent us from using + * it. + * + * If we can't find an AG with space for full alignment slack to + * be taken into account, we must be near ENOSPC in all AGs. + * Hence we don't include alignment for the second pass and so + * if we fail allocation due to alignment issues then it is most + * likely a real ENOSPC condition. */ ineed = mp->m_ialloc_blks; + if (flags && ineed > 1) + ineed += xfs_ialloc_cluster_alignment(mp); longest = pag->pagf_longest; if (!longest) longest = pag->pagf_flcount > 0; @@ -1137,11 +1148,7 @@ xfs_dialloc_ag_update_inobt( XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && (rec.ir_freecount == frec->ir_freecount)); - error = xfs_inobt_update(cur, &rec); - if (error) - return error; - - return 0; + return xfs_inobt_update(cur, &rec); } /* diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 95ad1c002d60..100007d56449 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -160,4 +160,8 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen); +int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, struct xfs_buf **bpp); + + #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index c9b06f30fe86..964c465ca69c 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f18fd2da49f7..002b6b3a1988 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_error.h" @@ -30,7 +28,6 @@ #include "xfs_icache.h" #include "xfs_trans.h" #include "xfs_ialloc.h" -#include "xfs_dinode.h" /* * Check that none of the inode's in the buffer have a next diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 6a00f7fed69d..0defbd02f62d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -22,9 +22,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_inum.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -34,7 +31,6 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_attr_sf.h" -#include "xfs_dinode.h" kmem_zone_t *xfs_ifork_zone; diff --git a/fs/xfs/libxfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h deleted file mode 100644 index 4ff2278e147a..000000000000 --- a/fs/xfs/libxfs/xfs_inum.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_INUM_H__ -#define __XFS_INUM_H__ - -/* - * Inode number format: - * low inopblog bits - offset in block - * next agblklog bits - block number in ag - * next agno_log bits - ag number - * high agno_log-agblklog-inopblog bits - 0 - */ - -struct xfs_mount; - -#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) -#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog -#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog -#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log -#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log -#define XFS_INO_BITS(mp) \ - XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp) -#define XFS_INO_TO_AGNO(mp,i) \ - ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp))) -#define XFS_INO_TO_AGINO(mp,i) \ - ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp))) -#define XFS_INO_TO_AGBNO(mp,i) \ - (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \ - XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp))) -#define XFS_INO_TO_OFFSET(mp,i) \ - ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) -#define XFS_INO_TO_FSB(mp,i) \ - XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i)) -#define XFS_AGINO_TO_INO(mp,a,i) \ - (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i)) -#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp)) -#define XFS_AGINO_TO_OFFSET(mp,i) \ - ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) -#define XFS_OFFBNO_TO_AGINO(mp,b,o) \ - ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) - -#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) -#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) - -#endif /* __XFS_INUM_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index aff12f2d4428..265314690415 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -361,7 +361,7 @@ typedef struct xfs_ictimestamp { /* * NOTE: This structure must be kept identical to struct xfs_dinode - * in xfs_dinode.h except for the endianness annotations. + * except for the endianness annotations. */ typedef struct xfs_icdinode { __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index ee7e0e80246b..c10597973333 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_ag.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_trans_space.h" diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 7c818f1e4484..9b59ffa1fc19 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" @@ -36,7 +34,6 @@ #include "xfs_trace.h" #include "xfs_buf.h" #include "xfs_icache.h" -#include "xfs_dinode.h" #include "xfs_rtalloc.h" diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 5f902fa7913f..752915fa775a 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -23,7 +23,6 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_ialloc.h" @@ -33,7 +32,6 @@ #include "xfs_cksum.h" #include "xfs_trans.h" #include "xfs_buf_item.h" -#include "xfs_dinode.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 2e739708afd3..8eb1c54bafbf 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -19,590 +19,6 @@ #define __XFS_SB_H__ /* - * Super block - * Fits into a sector-sized buffer at address 0 of each allocation group. - * Only the first of these is ever updated except during growfs. - */ - -struct xfs_buf; -struct xfs_mount; -struct xfs_trans; - -#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ -#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ -#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ -#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ -#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ -#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ -#define XFS_SB_VERSION_NUMBITS 0x000f -#define XFS_SB_VERSION_ALLFBITS 0xfff0 -#define XFS_SB_VERSION_ATTRBIT 0x0010 -#define XFS_SB_VERSION_NLINKBIT 0x0020 -#define XFS_SB_VERSION_QUOTABIT 0x0040 -#define XFS_SB_VERSION_ALIGNBIT 0x0080 -#define XFS_SB_VERSION_DALIGNBIT 0x0100 -#define XFS_SB_VERSION_SHAREDBIT 0x0200 -#define XFS_SB_VERSION_LOGV2BIT 0x0400 -#define XFS_SB_VERSION_SECTORBIT 0x0800 -#define XFS_SB_VERSION_EXTFLGBIT 0x1000 -#define XFS_SB_VERSION_DIRV2BIT 0x2000 -#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ -#define XFS_SB_VERSION_MOREBITSBIT 0x8000 - -/* - * Supported feature bit list is just all bits in the versionnum field because - * we've used them all up and understand them all. Except, of course, for the - * shared superblock bit, which nobody knows what it does and so is unsupported. - */ -#define XFS_SB_VERSION_OKBITS \ - ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \ - ~XFS_SB_VERSION_SHAREDBIT) - -/* - * There are two words to hold XFS "feature" bits: the original - * word, sb_versionnum, and sb_features2. Whenever a bit is set in - * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set. - * - * These defines represent bits in sb_features2. - */ -#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 -#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ -#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 -#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ -#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ -#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ -#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ -#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */ - -#define XFS_SB_VERSION2_OKBITS \ - (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ - XFS_SB_VERSION2_ATTR2BIT | \ - XFS_SB_VERSION2_PROJID32BIT | \ - XFS_SB_VERSION2_FTYPE) - -/* - * Superblock - in core version. Must match the ondisk version below. - * Must be padded to 64 bit alignment. - */ -typedef struct xfs_sb { - __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ - __uint32_t sb_blocksize; /* logical block size, bytes */ - xfs_rfsblock_t sb_dblocks; /* number of data blocks */ - xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ - xfs_rtblock_t sb_rextents; /* number of realtime extents */ - uuid_t sb_uuid; /* file system unique id */ - xfs_fsblock_t sb_logstart; /* starting block of log if internal */ - xfs_ino_t sb_rootino; /* root inode number */ - xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ - xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */ - xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */ - xfs_agblock_t sb_agblocks; /* size of an allocation group */ - xfs_agnumber_t sb_agcount; /* number of allocation groups */ - xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */ - xfs_extlen_t sb_logblocks; /* number of log blocks */ - __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ - __uint16_t sb_sectsize; /* volume sector size, bytes */ - __uint16_t sb_inodesize; /* inode size, bytes */ - __uint16_t sb_inopblock; /* inodes per block */ - char sb_fname[12]; /* file system name */ - __uint8_t sb_blocklog; /* log2 of sb_blocksize */ - __uint8_t sb_sectlog; /* log2 of sb_sectsize */ - __uint8_t sb_inodelog; /* log2 of sb_inodesize */ - __uint8_t sb_inopblog; /* log2 of sb_inopblock */ - __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ - __uint8_t sb_rextslog; /* log2 of sb_rextents */ - __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ - __uint8_t sb_imax_pct; /* max % of fs for inode space */ - /* statistics */ - /* - * These fields must remain contiguous. If you really - * want to change their layout, make sure you fix the - * code in xfs_trans_apply_sb_deltas(). - */ - __uint64_t sb_icount; /* allocated inodes */ - __uint64_t sb_ifree; /* free inodes */ - __uint64_t sb_fdblocks; /* free data blocks */ - __uint64_t sb_frextents; /* free realtime extents */ - /* - * End contiguous fields. - */ - xfs_ino_t sb_uquotino; /* user quota inode */ - xfs_ino_t sb_gquotino; /* group quota inode */ - __uint16_t sb_qflags; /* quota flags */ - __uint8_t sb_flags; /* misc. flags */ - __uint8_t sb_shared_vn; /* shared version number */ - xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */ - __uint32_t sb_unit; /* stripe or raid unit */ - __uint32_t sb_width; /* stripe or raid width */ - __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ - __uint8_t sb_logsectlog; /* log2 of the log sector size */ - __uint16_t sb_logsectsize; /* sector size for the log, bytes */ - __uint32_t sb_logsunit; /* stripe unit size for the log */ - __uint32_t sb_features2; /* additional feature bits */ - - /* - * bad features2 field as a result of failing to pad the sb - * structure to 64 bits. Some machines will be using this field - * for features2 bits. Easiest just to mark it bad and not use - * it for anything else. - */ - __uint32_t sb_bad_features2; - - /* version 5 superblock fields start here */ - - /* feature masks */ - __uint32_t sb_features_compat; - __uint32_t sb_features_ro_compat; - __uint32_t sb_features_incompat; - __uint32_t sb_features_log_incompat; - - __uint32_t sb_crc; /* superblock crc */ - __uint32_t sb_pad; - - xfs_ino_t sb_pquotino; /* project quota inode */ - xfs_lsn_t sb_lsn; /* last write sequence */ - - /* must be padded to 64 bit alignment */ -} xfs_sb_t; - -#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) - -/* - * Superblock - on disk version. Must match the in core version above. - * Must be padded to 64 bit alignment. - */ -typedef struct xfs_dsb { - __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ - __be32 sb_blocksize; /* logical block size, bytes */ - __be64 sb_dblocks; /* number of data blocks */ - __be64 sb_rblocks; /* number of realtime blocks */ - __be64 sb_rextents; /* number of realtime extents */ - uuid_t sb_uuid; /* file system unique id */ - __be64 sb_logstart; /* starting block of log if internal */ - __be64 sb_rootino; /* root inode number */ - __be64 sb_rbmino; /* bitmap inode for realtime extents */ - __be64 sb_rsumino; /* summary inode for rt bitmap */ - __be32 sb_rextsize; /* realtime extent size, blocks */ - __be32 sb_agblocks; /* size of an allocation group */ - __be32 sb_agcount; /* number of allocation groups */ - __be32 sb_rbmblocks; /* number of rt bitmap blocks */ - __be32 sb_logblocks; /* number of log blocks */ - __be16 sb_versionnum; /* header version == XFS_SB_VERSION */ - __be16 sb_sectsize; /* volume sector size, bytes */ - __be16 sb_inodesize; /* inode size, bytes */ - __be16 sb_inopblock; /* inodes per block */ - char sb_fname[12]; /* file system name */ - __u8 sb_blocklog; /* log2 of sb_blocksize */ - __u8 sb_sectlog; /* log2 of sb_sectsize */ - __u8 sb_inodelog; /* log2 of sb_inodesize */ - __u8 sb_inopblog; /* log2 of sb_inopblock */ - __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */ - __u8 sb_rextslog; /* log2 of sb_rextents */ - __u8 sb_inprogress; /* mkfs is in progress, don't mount */ - __u8 sb_imax_pct; /* max % of fs for inode space */ - /* statistics */ - /* - * These fields must remain contiguous. If you really - * want to change their layout, make sure you fix the - * code in xfs_trans_apply_sb_deltas(). - */ - __be64 sb_icount; /* allocated inodes */ - __be64 sb_ifree; /* free inodes */ - __be64 sb_fdblocks; /* free data blocks */ - __be64 sb_frextents; /* free realtime extents */ - /* - * End contiguous fields. - */ - __be64 sb_uquotino; /* user quota inode */ - __be64 sb_gquotino; /* group quota inode */ - __be16 sb_qflags; /* quota flags */ - __u8 sb_flags; /* misc. flags */ - __u8 sb_shared_vn; /* shared version number */ - __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */ - __be32 sb_unit; /* stripe or raid unit */ - __be32 sb_width; /* stripe or raid width */ - __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */ - __u8 sb_logsectlog; /* log2 of the log sector size */ - __be16 sb_logsectsize; /* sector size for the log, bytes */ - __be32 sb_logsunit; /* stripe unit size for the log */ - __be32 sb_features2; /* additional feature bits */ - /* - * bad features2 field as a result of failing to pad the sb - * structure to 64 bits. Some machines will be using this field - * for features2 bits. Easiest just to mark it bad and not use - * it for anything else. - */ - __be32 sb_bad_features2; - - /* version 5 superblock fields start here */ - - /* feature masks */ - __be32 sb_features_compat; - __be32 sb_features_ro_compat; - __be32 sb_features_incompat; - __be32 sb_features_log_incompat; - - __le32 sb_crc; /* superblock crc */ - __be32 sb_pad; - - __be64 sb_pquotino; /* project quota inode */ - __be64 sb_lsn; /* last write sequence */ - - /* must be padded to 64 bit alignment */ -} xfs_dsb_t; - -/* - * Sequence number values for the fields. - */ -typedef enum { - XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS, - XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO, - XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS, - XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS, - XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE, - XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG, - XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG, - XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT, - XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO, - XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, - XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, - XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, - XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, - XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, - XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, - XFS_SBS_PQUOTINO, XFS_SBS_LSN, - XFS_SBS_FIELDCOUNT -} xfs_sb_field_t; - -/* - * Mask values, defined based on the xfs_sb_field_t values. - * Only define the ones we're using. - */ -#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x) -#define XFS_SB_UUID XFS_SB_MVAL(UUID) -#define XFS_SB_FNAME XFS_SB_MVAL(FNAME) -#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO) -#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO) -#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO) -#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM) -#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO) -#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO) -#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS) -#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) -#define XFS_SB_UNIT XFS_SB_MVAL(UNIT) -#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) -#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) -#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) -#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) -#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) -#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) -#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) -#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) -#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) -#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) -#define XFS_SB_CRC XFS_SB_MVAL(CRC) -#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) -#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) -#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) -#define XFS_SB_MOD_BITS \ - (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ - XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ - XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ - XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ - XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \ - XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \ - XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO) - - -/* - * Misc. Flags - warning - these will be cleared by xfs_repair unless - * a feature bit is set when the flag is used. - */ -#define XFS_SBF_NOFLAGS 0x00 /* no flags set */ -#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */ - -/* - * define max. shared version we can interoperate with - */ -#define XFS_SB_MAX_SHARED_VN 0 - -#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) - -/* - * The first XFS version we support is a v4 superblock with V2 directories. - */ -static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) -{ - if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) - return false; - - /* check for unknown features in the fs */ - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) - return false; - - return true; -} - -static inline bool xfs_sb_good_version(struct xfs_sb *sbp) -{ - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) - return true; - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) - return xfs_sb_good_v4_features(sbp); - return false; -} - -/* - * Detect a mismatched features2 field. Older kernels read/wrote - * this into the wrong slot, so to be safe we keep them in sync. - */ -static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) -{ - return sbp->sb_bad_features2 != sbp->sb_features2; -} - -static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); -} - -static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) -{ - sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; -} - -static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); -} - -static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) -{ - sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; -} - -static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); -} - -static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); -} - -static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); -} - -static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); -} - -static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); -} - -static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); -} - -static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); -} - -/* - * sb_features2 bit version macros. - */ -static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); -} - -static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); -} - -static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) -{ - sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; - sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; - sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT; -} - -static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) -{ - sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; - sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; - if (!sbp->sb_features2) - sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; -} - -static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); -} - -static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) -{ - sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; - sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; - sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT; -} - -/* - * Extended v5 superblock feature masks. These are to be used for new v5 - * superblock features only. - * - * Compat features are new features that old kernels will not notice or affect - * and so can mount read-write without issues. - * - * RO-Compat (read only) are features that old kernels can read but will break - * if they write. Hence only read-only mounts of such filesystems are allowed on - * kernels that don't support the feature bit. - * - * InCompat features are features which old kernels will not understand and so - * must not mount. - * - * Log-InCompat features are for changes to log formats or new transactions that - * can't be replayed on older kernels. The fields are set when the filesystem is - * mounted, and a clean unmount clears the fields. - */ -#define XFS_SB_FEAT_COMPAT_ALL 0 -#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL -static inline bool -xfs_sb_has_compat_feature( - struct xfs_sb *sbp, - __uint32_t feature) -{ - return (sbp->sb_features_compat & feature) != 0; -} - -#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ -#define XFS_SB_FEAT_RO_COMPAT_ALL \ - (XFS_SB_FEAT_RO_COMPAT_FINOBT) -#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL -static inline bool -xfs_sb_has_ro_compat_feature( - struct xfs_sb *sbp, - __uint32_t feature) -{ - return (sbp->sb_features_ro_compat & feature) != 0; -} - -#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ -#define XFS_SB_FEAT_INCOMPAT_ALL \ - (XFS_SB_FEAT_INCOMPAT_FTYPE) - -#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL -static inline bool -xfs_sb_has_incompat_feature( - struct xfs_sb *sbp, - __uint32_t feature) -{ - return (sbp->sb_features_incompat & feature) != 0; -} - -#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 -#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL -static inline bool -xfs_sb_has_incompat_log_feature( - struct xfs_sb *sbp, - __uint32_t feature) -{ - return (sbp->sb_features_log_incompat & feature) != 0; -} - -/* - * V5 superblock specific feature checks - */ -static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; -} - -static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; -} - -static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); -} - -static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); -} - -/* - * end of superblock version macros - */ - -static inline bool -xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) -{ - return (ino == sbp->sb_uquotino || - ino == sbp->sb_gquotino || - ino == sbp->sb_pquotino); -} - -#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ -#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) -#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) - -#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) -#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ - xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d)) -#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ - XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) - -/* - * File system sector to basic block conversions. - */ -#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) - -/* - * File system block to basic block conversions. - */ -#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log) -#define XFS_BB_TO_FSB(mp,bb) \ - (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) -#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) - -/* - * File system block to byte conversions. - */ -#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog) -#define XFS_B_TO_FSB(mp,b) \ - ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) -#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) -#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) - -/* * perag get/put wrappers for ref counting */ extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 5782f037eab4..c80c5236c3da 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" -#include "xfs_ag.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index f2bda7c76b8a..6c1330f29050 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -22,8 +22,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index a65fa5dde6e9..4b641676f258 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -19,8 +19,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_ag.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_acl.h" diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 5dc163744511..3841b07f27bf 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -22,42 +22,6 @@ struct inode; struct posix_acl; struct xfs_inode; -#define XFS_ACL_NOT_PRESENT (-1) - -/* On-disk XFS access control list structure */ -struct xfs_acl_entry { - __be32 ae_tag; - __be32 ae_id; - __be16 ae_perm; - __be16 ae_pad; /* fill the implicit hole in the structure */ -}; - -struct xfs_acl { - __be32 acl_cnt; - struct xfs_acl_entry acl_entry[0]; -}; - -/* - * The number of ACL entries allowed is defined by the on-disk format. - * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is - * limited only by the maximum size of the xattr that stores the information. - */ -#define XFS_ACL_MAX_ENTRIES(mp) \ - (xfs_sb_version_hascrc(&mp->m_sb) \ - ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ - sizeof(struct xfs_acl_entry) \ - : 25) - -#define XFS_ACL_MAX_SIZE(mp) \ - (sizeof(struct xfs_acl) + \ - sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) - -/* On-disk XFS extended attribute names */ -#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" -#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" -#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) -#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) - #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f5b2453a43b2..18e2f3bbae5e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -20,8 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -33,7 +31,6 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include <linux/aio.h> #include <linux/gfp.h> #include <linux/mpage.h> diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index aa2a8b1838a2..83af4c149635 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -23,8 +23,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -39,7 +37,6 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trace.h" -#include "xfs_dinode.h" #include "xfs_dir2.h" /* diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 62db83ab6cbc..a43d370d2c58 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -39,7 +37,6 @@ #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" -#include "xfs_dinode.h" #include "xfs_dir2.h" STATIC int diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 281002689d64..22a5dcb70b32 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -23,8 +23,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_inode.h" @@ -42,7 +40,6 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" -#include "xfs_dinode.h" /* Kernel only BMAP related definitions and functions */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 24b4ebea0d4d..bb502a391792 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -34,18 +34,16 @@ #include <linux/backing-dev.h> #include <linux/freezer.h> +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trace.h" #include "xfs_log.h" static kmem_zone_t *xfs_buf_zone; -static struct workqueue_struct *xfslogd_workqueue; - #ifdef XFS_BUF_LOCK_TRACKING # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) # define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) @@ -463,7 +461,7 @@ _xfs_buf_find( * have to check that the buffer falls within the filesystem bounds. */ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (blkno >= eofs) { + if (blkno < 0 || blkno >= eofs) { /* * XXX (dgc): we should really be returning -EFSCORRUPTED here, * but none of the higher level infrastructure supports @@ -1043,7 +1041,7 @@ xfs_buf_ioend_work( struct work_struct *work) { struct xfs_buf *bp = - container_of(work, xfs_buf_t, b_iodone_work); + container_of(work, xfs_buf_t, b_ioend_work); xfs_buf_ioend(bp); } @@ -1052,8 +1050,8 @@ void xfs_buf_ioend_async( struct xfs_buf *bp) { - INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work); - queue_work(xfslogd_workqueue, &bp->b_iodone_work); + INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); + queue_work(bp->b_ioend_wq, &bp->b_ioend_work); } void @@ -1222,6 +1220,13 @@ _xfs_buf_ioapply( */ bp->b_error = 0; + /* + * Initialize the I/O completion workqueue if we haven't yet or the + * submitter has not opted to specify a custom one. + */ + if (!bp->b_ioend_wq) + bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue; + if (bp->b_flags & XBF_WRITE) { if (bp->b_flags & XBF_SYNCIO) rw = WRITE_SYNC; @@ -1882,15 +1887,8 @@ xfs_buf_init(void) if (!xfs_buf_zone) goto out; - xfslogd_workqueue = alloc_workqueue("xfslogd", - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1); - if (!xfslogd_workqueue) - goto out_free_buf_zone; - return 0; - out_free_buf_zone: - kmem_zone_destroy(xfs_buf_zone); out: return -ENOMEM; } @@ -1898,6 +1896,5 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 82002c00af90..75ff5d5a7d2e 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -164,7 +164,8 @@ typedef struct xfs_buf { struct xfs_perag *b_pag; /* contains rbtree root */ xfs_buftarg_t *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ - struct work_struct b_iodone_work; + struct work_struct b_ioend_work; + struct workqueue_struct *b_ioend_wq; /* I/O completion wq */ xfs_buf_iodone_t b_iodone; /* I/O completion function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f15969543326..3f9bd58edec7 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -17,11 +17,11 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_buf_item.h" diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index f1b69edcdf31..098cd78fe708 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -34,7 +32,6 @@ #include "xfs_trace.h" #include "xfs_bmap.h" #include "xfs_trans.h" -#include "xfs_dinode.h" /* * Directory file type support functions @@ -44,7 +41,7 @@ static unsigned char xfs_dir3_filetype_table[] = { DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, }; -unsigned char +static unsigned char xfs_dir3_get_dtype( struct xfs_mount *mp, __uint8_t filetype) @@ -57,22 +54,6 @@ xfs_dir3_get_dtype( return xfs_dir3_filetype_table[filetype]; } -/* - * @mode, if set, indicates that the type field needs to be set up. - * This uses the transformation from file mode to DT_* as defined in linux/fs.h - * for file type specification. This will be propagated into the directory - * structure if appropriate for the given operation and filesystem config. - */ -const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { - [0] = XFS_DIR3_FT_UNKNOWN, - [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR, - [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK, - [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK, -}; STATIC int xfs_dir2_sf_getdents( diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 13d08a1b390e..799e5a2d334d 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -20,7 +20,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" #include "xfs_inode.h" diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 63c2de49f61d..02c01bbbc789 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -22,8 +22,6 @@ #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index f33fbaaa4d8a..814cff94e78f 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -20,8 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_quota.h" diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index b92fd7bc49e3..3ee186ac1093 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -20,8 +20,6 @@ #include "xfs_fs.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_error.h" diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 5a6bd5d8779a..5eb4a14e0a0f 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -19,10 +19,9 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_export.h" #include "xfs_inode.h" diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index fd22f69049d4..c263e079273e 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -24,7 +24,6 @@ #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index c4327419dc5c..cb7fe64cdbfa 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -17,10 +17,9 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index eb596b419942..13e974e6a889 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -37,7 +35,6 @@ #include "xfs_ioctl.h" #include "xfs_trace.h" #include "xfs_log.h" -#include "xfs_dinode.h" #include "xfs_icache.h" #include <linux/aio.h> @@ -933,7 +930,6 @@ xfs_file_readdir( { struct inode *inode = file_inode(file); xfs_inode_t *ip = XFS_I(inode); - int error; size_t bufsize; /* @@ -950,10 +946,7 @@ xfs_file_readdir( */ bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); - error = xfs_readdir(ip, ctx, bufsize); - if (error) - return error; - return 0; + return xfs_readdir(ip, ctx, bufsize); } STATIC int diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index e92730c1d3ca..a2e86e8a0fea 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -20,16 +20,13 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_ag.h" #include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_inum.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_alloc.h" #include "xfs_mru_cache.h" -#include "xfs_dinode.h" #include "xfs_filestream.h" #include "xfs_trace.h" diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c05ac8b70fa9..fdc64220fcb0 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -22,7 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -40,7 +39,6 @@ #include "xfs_rtalloc.h" #include "xfs_trace.h" #include "xfs_log.h" -#include "xfs_dinode.h" #include "xfs_filestream.h" /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index b45f7b27b5df..9771b7ef62ed 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -20,9 +20,7 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_error.h" @@ -65,6 +63,7 @@ xfs_inode_alloc( return NULL; } + XFS_STATS_INC(vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!xfs_isiflocked(ip)); @@ -130,6 +129,7 @@ xfs_inode_free( /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!xfs_isiflocked(ip)); + XFS_STATS_DEC(vn_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 46748b86b12f..62f1f91c32cb 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -34,6 +34,14 @@ struct xfs_eofblocks { #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ /* + * tags for inode radix tree + */ +#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup + in xfs_inode_ag_iterator */ +#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ +#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ + +/* * Flags for xfs_iget() */ #define XFS_IGET_CREATE 0x1 diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 7e4549233251..d45ca72af6fb 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -18,11 +18,10 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8ed049d1e332..41f804e740d7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -23,9 +23,7 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_da_format.h" @@ -1082,7 +1080,7 @@ xfs_create( struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; struct xfs_dquot *pdqp = NULL; - struct xfs_trans_res tres; + struct xfs_trans_res *tres; uint resblks; trace_xfs_create(dp, name); @@ -1105,13 +1103,11 @@ xfs_create( if (is_dir) { rdev = 0; resblks = XFS_MKDIR_SPACE_RES(mp, name->len); - tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres; - tres.tr_logcount = XFS_MKDIR_LOG_COUNT; + tres = &M_RES(mp)->tr_mkdir; tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); } else { resblks = XFS_CREATE_SPACE_RES(mp, name->len); - tres.tr_logres = M_RES(mp)->tr_create.tr_logres; - tres.tr_logcount = XFS_CREATE_LOG_COUNT; + tres = &M_RES(mp)->tr_create; tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } @@ -1123,17 +1119,16 @@ xfs_create( * the case we'll drop the one we have and get a more * appropriate transaction later. */ - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - error = xfs_trans_reserve(tp, &tres, resblks, 0); + error = xfs_trans_reserve(tp, tres, resblks, 0); if (error == -ENOSPC) { /* flush outstanding delalloc blocks and retry */ xfs_flush_inodes(mp); - error = xfs_trans_reserve(tp, &tres, resblks, 0); + error = xfs_trans_reserve(tp, tres, resblks, 0); } if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; - error = xfs_trans_reserve(tp, &tres, 0, 0); + error = xfs_trans_reserve(tp, tres, 0, 0); } if (error) { cancel_flags = 0; @@ -2488,9 +2483,7 @@ xfs_remove( xfs_fsblock_t first_block; int cancel_flags; int committed; - int link_zero; uint resblks; - uint log_count; trace_xfs_remove(dp, name); @@ -2505,13 +2498,10 @@ xfs_remove( if (error) goto std_return; - if (is_dir) { + if (is_dir) tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); - log_count = XFS_DEFAULT_LOG_COUNT; - } else { + else tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - log_count = XFS_REMOVE_LOG_COUNT; - } cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* @@ -2579,9 +2569,6 @@ xfs_remove( if (error) goto out_trans_cancel; - /* Determine if this is the last link while the inode is locked */ - link_zero = (ip->i_d.di_nlink == 0); - xfs_bmap_init(&free_list, &first_block); error = xfs_dir_removename(tp, dp, name, ip->i_ino, &first_block, &free_list, resblks); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 9af2882e1f4c..4ed2ba9342dc 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -20,7 +20,6 @@ #include "xfs_inode_buf.h" #include "xfs_inode_fork.h" -#include "xfs_dinode.h" /* * Kernel only inode definitions @@ -324,7 +323,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ ((pip)->i_d.di_mode & S_ISGID)) - int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 63de0b0acc32..bf13a5a7e2f4 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -20,8 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -29,7 +27,6 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans_priv.h" -#include "xfs_dinode.h" #include "xfs_log.h" diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 24c926b6fe85..a1831980a68e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_ioctl.h" @@ -40,7 +38,6 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_symlink.h" -#include "xfs_dinode.h" #include "xfs_trans.h" #include <linux/capability.h> diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 94ce027e28e3..ec6772866f3d 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -25,8 +25,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_itable.h" diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index afcf3c926565..c980e2a5086b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -38,7 +36,6 @@ #include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" -#include "xfs_dinode.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -52,7 +49,6 @@ xfs_iomap_eof_align_last_fsb( xfs_extlen_t extsize, xfs_fileoff_t *last_fsb) { - xfs_fileoff_t new_last_fsb = 0; xfs_extlen_t align = 0; int eof, error; @@ -70,8 +66,8 @@ xfs_iomap_eof_align_last_fsb( else if (mp->m_dalign) align = mp->m_dalign; - if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align)) - new_last_fsb = roundup_64(*last_fsb, align); + if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align)) + align = 0; } /* @@ -79,14 +75,14 @@ xfs_iomap_eof_align_last_fsb( * (when file on a real-time subvolume or has di_extsize hint). */ if (extsize) { - if (new_last_fsb) - align = roundup_64(new_last_fsb, extsize); + if (align) + align = roundup_64(align, extsize); else align = extsize; - new_last_fsb = roundup_64(*last_fsb, align); } - if (new_last_fsb) { + if (align) { + xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align); error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); if (error) return error; @@ -264,7 +260,6 @@ xfs_iomap_eof_want_preallocate( { xfs_fileoff_t start_fsb; xfs_filblks_t count_fsb; - xfs_fsblock_t firstblock; int n, error, imaps; int found_delalloc = 0; @@ -289,7 +284,6 @@ xfs_iomap_eof_want_preallocate( count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); while (count_fsb > 0) { imaps = nimaps; - firstblock = NULLFSBLOCK; error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, 0); if (error) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ec6dcdc181ee..c50311cae1b1 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_inode.h" @@ -37,8 +35,7 @@ #include "xfs_icache.h" #include "xfs_symlink.h" #include "xfs_da_btree.h" -#include "xfs_dir2_priv.h" -#include "xfs_dinode.h" +#include "xfs_dir2.h" #include "xfs_trans_space.h" #include <linux/capability.h> diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 894924a5129b..82e314258f73 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -21,9 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_inum.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -33,7 +30,6 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_dinode.h" STATIC int xfs_internal_inum( @@ -352,7 +348,6 @@ xfs_bulkstat( int *done) /* 1 if there are more stats to get */ { xfs_buf_t *agbp; /* agi header buffer */ - xfs_agi_t *agi; /* agi header data */ xfs_agino_t agino; /* inode # in allocation group */ xfs_agnumber_t agno; /* allocation group number */ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ @@ -403,7 +398,6 @@ xfs_bulkstat( error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); if (error) break; - agi = XFS_BUF_TO_AGI(agbp); /* * Allocate and initialize a btree cursor for ialloc btree. */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 6a51619d8690..c31d2c2eadc4 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -384,4 +384,10 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #endif /* XFS_WARN */ #endif /* DEBUG */ +#ifdef CONFIG_XFS_RT +#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) +#else +#define XFS_IS_REALTIME_INODE(ip) (0) +#endif + #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fe88ef67f93a..e408bf5a3ff7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_trans.h" @@ -1031,7 +1029,7 @@ xfs_log_need_covered(xfs_mount_t *mp) struct xlog *log = mp->m_log; int needed = 0; - if (!xfs_fs_writable(mp)) + if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) return 0; if (!xlog_cil_empty(log)) @@ -1808,6 +1806,8 @@ xlog_sync( XFS_BUF_ZEROFLAGS(bp); XFS_BUF_ASYNC(bp); bp->b_flags |= XBF_SYNCIO; + /* use high priority completion wq */ + bp->b_ioend_wq = log->l_mp->m_log_workqueue; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { bp->b_flags |= XBF_FUA; @@ -1856,6 +1856,8 @@ xlog_sync( bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; + /* use high priority completion wq */ + bp->b_ioend_wq = log->l_mp->m_log_workqueue; ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index f506c457011e..45cc0ce18adf 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -17,11 +17,10 @@ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_alloc.h" diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 00cd7f3a8f59..a5a945fc3bdc 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -22,11 +22,10 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_log.h" @@ -42,7 +41,6 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_error.h" #include "xfs_dir2.h" diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 63ca2f0420b1..d8b67547ab34 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -17,10 +17,9 @@ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 51435dbce9c4..d3d38836f87f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -22,11 +22,10 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_ialloc.h" @@ -41,7 +40,6 @@ #include "xfs_fsops.h" #include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_dinode.h" #include "xfs_sysfs.h" @@ -1074,11 +1072,23 @@ xfs_unmountfs( xfs_sysfs_del(&mp->m_kobj); } -int -xfs_fs_writable(xfs_mount_t *mp) +/* + * Determine whether modifications can proceed. The caller specifies the minimum + * freeze level for which modifications should not be allowed. This allows + * certain operations to proceed while the freeze sequence is in progress, if + * necessary. + */ +bool +xfs_fs_writable( + struct xfs_mount *mp, + int level) { - return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || - (mp->m_flags & XFS_MOUNT_RDONLY)); + ASSERT(level > SB_UNFROZEN); + if ((mp->m_super->s_writers.frozen >= level) || + XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY)) + return false; + + return true; } /* @@ -1086,9 +1096,9 @@ xfs_fs_writable(xfs_mount_t *mp) * * Sync the superblock counters to disk. * - * Note this code can be called during the process of freezing, so - * we may need to use the transaction allocator which does not - * block when the transaction subsystem is in its frozen state. + * Note this code can be called during the process of freezing, so we use the + * transaction allocator that does not block when the transaction subsystem is + * in its frozen state. */ int xfs_log_sbcount(xfs_mount_t *mp) @@ -1096,7 +1106,8 @@ xfs_log_sbcount(xfs_mount_t *mp) xfs_trans_t *tp; int error; - if (!xfs_fs_writable(mp)) + /* allow this to proceed during the freeze sequence... */ + if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) return 0; xfs_icsb_sync_counters(mp, 0); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b0447c86e7e2..22ccf69d4d3c 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -168,6 +168,7 @@ typedef struct xfs_mount { /* low free space thresholds */ struct xfs_kobj m_kobj; + struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; @@ -320,10 +321,7 @@ typedef struct xfs_mod_sb { /* * Per-ag incore structure, copies of information in agf and agi, to improve the - * performance of allocation group selection. This is defined for the kernel - * only, and hence is defined here instead of in xfs_ag.h. You need the struct - * xfs_mount to be defined to look up a xfs_perag anyway (via mp->m_perag_tree), - * so this doesn't introduce any strange header file dependencies. + * performance of allocation group selection. */ typedef struct xfs_perag { struct xfs_mount *pag_mount; /* owner filesystem */ @@ -384,7 +382,7 @@ extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); -extern int xfs_fs_writable(xfs_mount_t *); +extern bool xfs_fs_writable(struct xfs_mount *mp, int level); extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index d68f23021af3..79fb19dd9c83 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -23,7 +23,6 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_ialloc.h" @@ -38,7 +37,6 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_cksum.h" -#include "xfs_dinode.h" /* * The global quota manager. There is only one of these for the entire @@ -1749,23 +1747,21 @@ xfs_qm_vop_dqalloc( xfs_iunlock(ip, lockflags); if (O_udqpp) *O_udqpp = uq; - else if (uq) + else xfs_qm_dqrele(uq); if (O_gdqpp) *O_gdqpp = gq; - else if (gq) + else xfs_qm_dqrele(gq); if (O_pdqpp) *O_pdqpp = pq; - else if (pq) + else xfs_qm_dqrele(pq); return 0; error_rele: - if (gq) - xfs_qm_dqrele(gq); - if (uq) - xfs_qm_dqrele(uq); + xfs_qm_dqrele(gq); + xfs_qm_dqrele(uq); return error; } diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 2c61e61b0205..3e52d5de7ae1 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -20,8 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_inode.h" diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 80f2d77d929a..74fca68e43b6 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -26,7 +26,6 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -784,19 +783,21 @@ xfs_qm_log_quotaoff( { xfs_trans_t *tp; int error; - xfs_qoff_logitem_t *qoffi=NULL; - uint oldsbqflag=0; + xfs_qoff_logitem_t *qoffi; + + *qoffstartp = NULL; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); - if (error) - goto error0; + if (error) { + xfs_trans_cancel(tp, 0); + goto out; + } qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); spin_lock(&mp->m_sb_lock); - oldsbqflag = mp->m_sb.sb_qflags; mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; spin_unlock(&mp->m_sb_lock); @@ -809,19 +810,11 @@ xfs_qm_log_quotaoff( */ xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); + if (error) + goto out; -error0: - if (error) { - xfs_trans_cancel(tp, 0); - /* - * No one else is modifying sb_qflags, so this is OK. - * We still hold the quotaofflock. - */ - spin_lock(&mp->m_sb_lock); - mp->m_sb.sb_qflags = oldsbqflag; - spin_unlock(&mp->m_sb_lock); - } *qoffstartp = qoffi; +out: return error; } diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index b238027df987..7542bbeca6a1 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -19,8 +19,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_quota.h" diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index e1175ea9b551..f2079b6911cc 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" @@ -36,7 +34,6 @@ #include "xfs_trace.h" #include "xfs_buf.h" #include "xfs_icache.h" -#include "xfs_dinode.h" #include "xfs_rtalloc.h" diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 206b97fd1d8a..19cbda196369 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -21,9 +21,7 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_inode.h" @@ -44,7 +42,6 @@ #include "xfs_icache.h" #include "xfs_trace.h" #include "xfs_icreate_item.h" -#include "xfs_dinode.h" #include "xfs_filestream.h" #include "xfs_quota.h" #include "xfs_sysfs.h" @@ -796,8 +793,7 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp, mp->m_ddev_targp); out_close_rtdev: - if (rtdev) - xfs_blkdev_put(rtdev); + xfs_blkdev_put(rtdev); out_close_logdev: if (logdev && logdev != ddev) xfs_blkdev_put(logdev); @@ -842,10 +838,15 @@ STATIC int xfs_init_mount_workqueues( struct xfs_mount *mp) { + mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname); + if (!mp->m_buf_workqueue) + goto out; + mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_data_workqueue) - goto out; + goto out_destroy_buf; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); @@ -863,7 +864,7 @@ xfs_init_mount_workqueues( goto out_destroy_cil; mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", - WQ_FREEZABLE, 0, mp->m_fsname); + WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname); if (!mp->m_log_workqueue) goto out_destroy_reclaim; @@ -884,6 +885,8 @@ out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: destroy_workqueue(mp->m_data_workqueue); +out_destroy_buf: + destroy_workqueue(mp->m_buf_workqueue); out: return -ENOMEM; } @@ -898,6 +901,7 @@ xfs_destroy_mount_workqueues( destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); + destroy_workqueue(mp->m_buf_workqueue); } /* @@ -1000,7 +1004,6 @@ xfs_fs_evict_inode( clear_inode(inode); XFS_STATS_INC(vn_rele); XFS_STATS_INC(vn_remove); - XFS_STATS_DEC(vn_active); xfs_inactive(ip); } diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 02ae62a998e0..25791df6f638 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -23,8 +23,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -42,7 +40,6 @@ #include "xfs_symlink.h" #include "xfs_trans.h" #include "xfs_log.h" -#include "xfs_dinode.h" /* ----- Kernel only functions below ----- */ STATIC int diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 1e85bcd0e418..13a029806805 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_inode.h" diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 30e8e3410955..fa3135b9bf04 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -22,8 +22,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_extent_busy.h" diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 859482f53b5a..573aefb5a573 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -18,10 +18,9 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index e2b2216b1635..0a4d4ab6d9a9 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -229,13 +227,6 @@ xfs_trans_getsb(xfs_trans_t *tp, return bp; } -#ifdef DEBUG -xfs_buftarg_t *xfs_error_target; -int xfs_do_error; -int xfs_req_num; -int xfs_error_mod = 33; -#endif - /* * Get and lock the buffer for the caller if it is not already * locked within the given transaction. If it has not yet been @@ -257,46 +248,11 @@ xfs_trans_read_buf_map( struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { - xfs_buf_t *bp; - xfs_buf_log_item_t *bip; + struct xfs_buf *bp = NULL; + struct xfs_buf_log_item *bip; int error; *bpp = NULL; - if (!tp) { - bp = xfs_buf_read_map(target, map, nmaps, flags, ops); - if (!bp) - return (flags & XBF_TRYLOCK) ? - -EAGAIN : -ENOMEM; - - if (bp->b_error) { - error = bp->b_error; - xfs_buf_ioerror_alert(bp, __func__); - XFS_BUF_UNDONE(bp); - xfs_buf_stale(bp); - xfs_buf_relse(bp); - - /* bad CRC means corrupted metadata */ - if (error == -EFSBADCRC) - error = -EFSCORRUPTED; - return error; - } -#ifdef DEBUG - if (xfs_do_error) { - if (xfs_error_target == target) { - if (((xfs_req_num++) % xfs_error_mod) == 0) { - xfs_buf_relse(bp); - xfs_debug(mp, "Returning error!"); - return -EIO; - } - } - } -#endif - if (XFS_FORCED_SHUTDOWN(mp)) - goto shutdown_abort; - *bpp = bp; - return 0; - } - /* * If we find the buffer in the cache with this transaction * pointer in its b_fsprivate2 field, then we know we already @@ -305,49 +261,24 @@ xfs_trans_read_buf_map( * If the buffer is not yet read in, then we read it in, increment * the lock recursion count, and return it to the caller. */ - bp = xfs_trans_buf_item_match(tp, target, map, nmaps); - if (bp != NULL) { + if (tp) + bp = xfs_trans_buf_item_match(tp, target, map, nmaps); + if (bp) { ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_transp == tp); ASSERT(bp->b_fspriv != NULL); ASSERT(!bp->b_error); - if (!(XFS_BUF_ISDONE(bp))) { - trace_xfs_trans_read_buf_io(bp, _RET_IP_); - ASSERT(!XFS_BUF_ISASYNC(bp)); - ASSERT(bp->b_iodone == NULL); - XFS_BUF_READ(bp); - bp->b_ops = ops; - - error = xfs_buf_submit_wait(bp); - if (error) { - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_relse(bp); - /* - * We can gracefully recover from most read - * errors. Ones we can't are those that happen - * after the transaction's already dirty. - */ - if (tp->t_flags & XFS_TRANS_DIRTY) - xfs_force_shutdown(tp->t_mountp, - SHUTDOWN_META_IO_ERROR); - /* bad CRC means corrupted metadata */ - if (error == -EFSBADCRC) - error = -EFSCORRUPTED; - return error; - } - } + ASSERT(bp->b_flags & XBF_DONE); + /* * We never locked this buf ourselves, so we shouldn't * brelse it either. Just get out. */ if (XFS_FORCED_SHUTDOWN(mp)) { trace_xfs_trans_read_buf_shut(bp, _RET_IP_); - *bpp = NULL; return -EIO; } - bip = bp->b_fspriv; bip->bli_recur++; @@ -358,17 +289,29 @@ xfs_trans_read_buf_map( } bp = xfs_buf_read_map(target, map, nmaps, flags, ops); - if (bp == NULL) { - *bpp = NULL; - return (flags & XBF_TRYLOCK) ? - 0 : -ENOMEM; + if (!bp) { + if (!(flags & XBF_TRYLOCK)) + return -ENOMEM; + return tp ? 0 : -EAGAIN; } + + /* + * If we've had a read error, then the contents of the buffer are + * invalid and should not be used. To ensure that a followup read tries + * to pull the buffer from disk again, we clear the XBF_DONE flag and + * mark the buffer stale. This ensures that anyone who has a current + * reference to the buffer will interpret it's contents correctly and + * future cache lookups will also treat it as an empty, uninitialised + * buffer. + */ if (bp->b_error) { error = bp->b_error; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_buf_ioerror_alert(bp, __func__); + bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); - XFS_BUF_DONE(bp); - xfs_buf_ioerror_alert(bp, __func__); - if (tp->t_flags & XFS_TRANS_DIRTY) + + if (tp && (tp->t_flags & XFS_TRANS_DIRTY)) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); @@ -377,33 +320,19 @@ xfs_trans_read_buf_map( error = -EFSCORRUPTED; return error; } -#ifdef DEBUG - if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) { - if (xfs_error_target == target) { - if (((xfs_req_num++) % xfs_error_mod) == 0) { - xfs_force_shutdown(tp->t_mountp, - SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); - xfs_debug(mp, "Returning trans error!"); - return -EIO; - } - } + + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_buf_relse(bp); + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); + return -EIO; } -#endif - if (XFS_FORCED_SHUTDOWN(mp)) - goto shutdown_abort; - _xfs_trans_bjoin(tp, bp, 1); + if (tp) + _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_read_buf(bp->b_fspriv); - *bpp = bp; return 0; -shutdown_abort: - trace_xfs_trans_read_buf_shut(bp, _RET_IP_); - xfs_buf_relse(bp); - *bpp = NULL; - return -EIO; } /* diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 846e061c2e98..76a16df55ef7 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_error.h" diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 47978ba89dae..284397dd7990 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -18,10 +18,9 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index cdb4d86520e1..17280cd71934 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 93455b998041..69f6e475de97 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -20,8 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_inode.h" |