-rw-r--r--  Documentation/dma-buf-sharing.txt  13
-rw-r--r--  Documentation/filesystems/ext4.txt  21
-rw-r--r--  arch/hexagon/Kconfig  33
-rw-r--r--  arch/hexagon/Makefile  17
-rw-r--r--  arch/hexagon/include/asm/Kbuild  2
-rw-r--r--  arch/hexagon/include/asm/atomic.h  22
-rw-r--r--  arch/hexagon/include/asm/elf.h  23
-rw-r--r--  arch/hexagon/include/asm/hexagon_vm.h  56
-rw-r--r--  arch/hexagon/include/asm/io.h  16
-rw-r--r--  arch/hexagon/include/asm/mem-layout.h  28
-rw-r--r--  arch/hexagon/include/asm/page.h  12
-rw-r--r--  arch/hexagon/include/asm/processor.h  51
-rw-r--r--  arch/hexagon/include/asm/vm_mmu.h  11
-rw-r--r--  arch/hexagon/include/uapi/asm/ptrace.h  5
-rw-r--r--  arch/hexagon/include/uapi/asm/registers.h  17
-rw-r--r--  arch/hexagon/include/uapi/asm/signal.h  4
-rw-r--r--  arch/hexagon/include/uapi/asm/unistd.h  3
-rw-r--r--  arch/hexagon/include/uapi/asm/user.h  6
-rw-r--r--  arch/hexagon/kernel/Makefile  2
-rw-r--r--  arch/hexagon/kernel/asm-offsets.c  5
-rw-r--r--  arch/hexagon/kernel/dma.c  27
-rw-r--r--  arch/hexagon/kernel/head.S  107
-rw-r--r--  arch/hexagon/kernel/kgdb.c  4
-rw-r--r--  arch/hexagon/kernel/process.c  42
-rw-r--r--  arch/hexagon/kernel/ptrace.c  26
-rw-r--r--  arch/hexagon/kernel/setup.c  9
-rw-r--r--  arch/hexagon/kernel/signal.c  45
-rw-r--r--  arch/hexagon/kernel/topology.c  52
-rw-r--r--  arch/hexagon/kernel/traps.c  36
-rw-r--r--  arch/hexagon/kernel/vm_entry.S  282
-rw-r--r--  arch/hexagon/kernel/vm_events.c  4
-rw-r--r--  arch/hexagon/kernel/vm_vectors.S  4
-rw-r--r--  arch/hexagon/kernel/vmlinux.lds.S  12
-rw-r--r--  arch/hexagon/mm/init.c  35
-rw-r--r--  arch/hexagon/mm/vm_fault.c  4
-rw-r--r--  arch/openrisc/Kconfig  3
-rw-r--r--  drivers/base/dma-buf.c  169
-rw-r--r--  drivers/tty/pty.c  3
-rw-r--r--  drivers/tty/tty_io.c  4
-rw-r--r--  fs/buffer.c  5
-rw-r--r--  fs/ext4/Kconfig  3
-rw-r--r--  fs/ext4/balloc.c  53
-rw-r--r--  fs/ext4/dir.c  20
-rw-r--r--  fs/ext4/ext4.h  101
-rw-r--r--  fs/ext4/ext4_extents.h  5
-rw-r--r--  fs/ext4/ext4_jbd2.c  8
-rw-r--r--  fs/ext4/ext4_jbd2.h  12
-rw-r--r--  fs/ext4/extents.c  522
-rw-r--r--  fs/ext4/fsync.c  3
-rw-r--r--  fs/ext4/ialloc.c  88
-rw-r--r--  fs/ext4/indirect.c  473
-rw-r--r--  fs/ext4/inline.c  178
-rw-r--r--  fs/ext4/inode.c  580
-rw-r--r--  fs/ext4/ioctl.c  218
-rw-r--r--  fs/ext4/mballoc.c  253
-rw-r--r--  fs/ext4/migrate.c  62
-rw-r--r--  fs/ext4/mmp.c  6
-rw-r--r--  fs/ext4/move_extent.c  73
-rw-r--r--  fs/ext4/namei.c  48
-rw-r--r--  fs/ext4/page-io.c  280
-rw-r--r--  fs/ext4/resize.c  16
-rw-r--r--  fs/ext4/super.c  131
-rw-r--r--  fs/ext4/xattr.c  13
-rw-r--r--  fs/ext4/xattr.h  1
-rw-r--r--  fs/jbd2/commit.c  50
-rw-r--r--  fs/jbd2/journal.c  31
-rw-r--r--  fs/jbd2/transaction.c  9
-rw-r--r--  include/linux/buffer_head.h  4
-rw-r--r--  include/linux/dma-buf.h  16
-rw-r--r--  include/linux/jbd2.h  4
-rw-r--r--  include/linux/journal-head.h  11
-rw-r--r--  include/trace/events/ext4.h  16
-rw-r--r--  include/trace/events/jbd2.h  21
73 files changed, 2753 insertions, 1776 deletions
diff --git a/Documentation/dma-buf-sharing.txt b/Documentation/dma-buf-sharing.txt
index 4966b1be42ac..0b23261561d2 100644
--- a/Documentation/dma-buf-sharing.txt
+++ b/Documentation/dma-buf-sharing.txt
@@ -52,14 +52,23 @@ The dma_buf buffer sharing API usage contains the following steps:
associated with this buffer.
Interface:
- struct dma_buf *dma_buf_export(void *priv, struct dma_buf_ops *ops,
- size_t size, int flags)
+ struct dma_buf *dma_buf_export_named(void *priv, struct dma_buf_ops *ops,
+ size_t size, int flags,
+ const char *exp_name)
If this succeeds, dma_buf_export allocates a dma_buf structure, and returns a
pointer to the same. It also associates an anonymous file with this buffer,
so it can be exported. On failure to allocate the dma_buf object, it returns
NULL.
+ 'exp_name' is the name of the exporter - to provide useful information
+ while debugging.
+
+ Exporting modules which do not wish to provide any specific name may use the
+ helper define 'dma_buf_export()', with the same arguments as above, but
+ without the last argument; the __FILE__ pre-processor macro will be
+ substituted for 'exp_name' instead.
+
2. Userspace gets a handle to pass around to potential buffer-users
Userspace entity requests for a file-descriptor (fd) which is a handle to the
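
As a hedged illustration, a minimal exporter sketch; only dma_buf_export_named()
and the wrapper semantics come from this patch, while 'my_buffer' and
'my_dmabuf_ops' are hypothetical:

    #include <linux/dma-buf.h>
    #include <linux/fs.h>

    /* The helper define described above, per this patch:
     *   #define dma_buf_export(priv, ops, size, flags) \
     *           dma_buf_export_named(priv, ops, size, flags, __FILE__)
     */

    struct my_buffer {                        /* hypothetical exporter state */
            size_t size;
    };

    static struct dma_buf_ops my_dmabuf_ops;  /* hypothetical; fill in callbacks */

    static struct dma_buf *my_export(struct my_buffer *buf)
    {
            /* Explicit name: shows up as "my_driver" in debug output. */
            return dma_buf_export_named(buf, &my_dmabuf_ops, buf->size,
                                        O_RDWR, "my_driver");
    }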
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 34ea4f1fa6ea..f7cbf574a875 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -494,6 +494,17 @@ Files in /sys/fs/ext4/<devname>
session_write_kbytes This file is read-only and shows the number of
kilobytes of data that have been written to this
filesystem since it was mounted.
+
+ reserved_clusters This is a read-write file and contains the number
+ of reserved clusters in the file system, which will
+ be used in specific situations to avoid costly
+ zeroout, unexpected ENOSPC, or possible data
+ loss. The default is 2% or 4096 clusters,
+ whichever is smaller, and this can be changed;
+ however, it can never exceed the number of clusters
+ in the file system. If there is not enough space
+ for the reserved space when mounting the file
+ system, the mount will _not_ fail.
..............................................................................
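To make the reserved_clusters knob above concrete, a small userspace sketch;
the device name 'sda1' and the doubling policy are assumptions, not part of
this patch:

    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical path; substitute the mounted device's name. */
            FILE *f = fopen("/sys/fs/ext4/sda1/reserved_clusters", "r+");
            unsigned long reserved;

            if (!f)
                    return 1;
            if (fscanf(f, "%lu", &reserved) != 1) {
                    fclose(f);
                    return 1;
            }
            printf("reserved_clusters = %lu\n", reserved);

            /* Writes may not exceed the cluster count of the file system. */
            rewind(f);
            fprintf(f, "%lu\n", reserved * 2);
            fclose(f);
            return 0;
    }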
Ioctls
@@ -587,6 +598,16 @@ Table of Ext4 specific ioctls
bitmaps and inode table, the userspace tool thus
just passes the new number of blocks.
+EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes
+ (i_size, i_flags, ...) between the
+ specified inode and inode
+ EXT4_BOOT_LOADER_INO (#5). This is typically
+ used to store a boot loader in a secure part of
+ the filesystem, where it can't be changed by a
+ normal user by accident.
+ The data blocks of the previous boot loader
+ will be associated with the given inode.
+
..............................................................................
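A hedged userspace sketch of the EXT4_IOC_SWAP_BOOT ioctl described above; the
request code is assumed to be _IO('f', 17) per fs/ext4/ext4.h in this series,
and the open flags may need adjusting:

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    #ifndef EXT4_IOC_SWAP_BOOT
    #define EXT4_IOC_SWAP_BOOT _IO('f', 17)   /* assumed value */
    #endif

    /* Swap the given inode's blocks with EXT4_BOOT_LOADER_INO (#5). */
    int install_boot_loader(const char *path)
    {
            int fd = open(path, O_RDWR);
            int ret;

            if (fd < 0)
                    return -1;
            ret = ioctl(fd, EXT4_IOC_SWAP_BOOT);
            close(fd);
            return ret;
    }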
References
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index e4decc6b8947..04dff5bdcbf7 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -29,21 +29,17 @@ config HEXAGON
select GENERIC_CLOCKEVENTS
select GENERIC_CLOCKEVENTS_BROADCAST
select MODULES_USE_ELF_RELA
+ select GENERIC_CPU_DEVICES
+ select GENERIC_KERNEL_THREAD
+ select GENERIC_KERNEL_EXECVE
---help---
Qualcomm Hexagon is a processor architecture designed for high
performance and low power across a wide variety of applications.
-config HEXAGON_ARCH_V1
- bool
-
-config HEXAGON_ARCH_V2
- bool
-
-config HEXAGON_ARCH_V3
- bool
-
-config HEXAGON_ARCH_V4
- bool
+config HEXAGON_PHYS_OFFSET
+ def_bool y
+ ---help---
+ Platforms that don't load the kernel at zero set this.
config FRAME_POINTER
def_bool y
@@ -81,9 +77,6 @@ config RWSEM_GENERIC_SPINLOCK
config RWSEM_XCHGADD_ALGORITHM
def_bool y
-config GENERIC_FIND_NEXT_BIT
- def_bool y
-
config GENERIC_HWEIGHT
def_bool y
@@ -103,14 +96,14 @@ choice
config HEXAGON_COMET
bool "Comet Board"
- select HEXAGON_ARCH_V2
---help---
Support for the Comet platform.
endchoice
-config HEXAGON_VM
- def_bool y
+config HEXAGON_ARCH_VERSION
+ int "Architecture version"
+ default 2
config CMDLINE
string "Default kernel command string"
@@ -122,12 +115,6 @@ config CMDLINE
minimum, you should specify the memory size and the root device
(e.g., mem=64M root=/dev/nfs).
-config HEXAGON_ANGEL_TRAPS
- bool "Use Angel Traps"
- default n
- ---help---
- Enable angel debug traps (for printk's).
-
config SMP
bool "Multi-Processing support"
---help---
diff --git a/arch/hexagon/Makefile b/arch/hexagon/Makefile
index d00d900b2566..207711a0fd0c 100644
--- a/arch/hexagon/Makefile
+++ b/arch/hexagon/Makefile
@@ -15,20 +15,9 @@ KBUILD_CFLAGS += -fno-short-enums
# LDFLAGS_MODULE += -shared
CFLAGS_MODULE += -mlong-calls
-cflags-$(CONFIG_HEXAGON_ARCH_V1) += $(call cc-option,-mv1)
-cflags-$(CONFIG_HEXAGON_ARCH_V2) += $(call cc-option,-mv2)
-cflags-$(CONFIG_HEXAGON_ARCH_V3) += $(call cc-option,-mv3)
-cflags-$(CONFIG_HEXAGON_ARCH_V4) += $(call cc-option,-mv4)
-
-aflags-$(CONFIG_HEXAGON_ARCH_V1) += $(call cc-option,-mv1)
-aflags-$(CONFIG_HEXAGON_ARCH_V2) += $(call cc-option,-mv2)
-aflags-$(CONFIG_HEXAGON_ARCH_V3) += $(call cc-option,-mv3)
-aflags-$(CONFIG_HEXAGON_ARCH_V4) += $(call cc-option,-mv4)
-
-ldflags-$(CONFIG_HEXAGON_ARCH_V1) += $(call cc-option,-mv1)
-ldflags-$(CONFIG_HEXAGON_ARCH_V2) += $(call cc-option,-mv2)
-ldflags-$(CONFIG_HEXAGON_ARCH_V3) += $(call cc-option,-mv3)
-ldflags-$(CONFIG_HEXAGON_ARCH_V4) += $(call cc-option,-mv4)
+cflags-y += $(call cc-option,-mv${CONFIG_HEXAGON_ARCH_VERSION})
+aflags-y += $(call cc-option,-mv${CONFIG_HEXAGON_ARCH_VERSION})
+ldflags-y += $(call cc-option,-mv${CONFIG_HEXAGON_ARCH_VERSION})
KBUILD_CFLAGS += $(cflags-y)
KBUILD_AFLAGS += $(aflags-y)
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index bdb54ceb53bc..1da17caac23c 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -25,7 +25,6 @@ generic-y += kdebug.h
generic-y += kmap_types.h
generic-y += local64.h
generic-y += local.h
-generic-y += local.h
generic-y += mman.h
generic-y += msgbuf.h
generic-y += pci.h
@@ -41,6 +40,7 @@ generic-y += sembuf.h
generic-y += shmbuf.h
generic-y += shmparam.h
generic-y += siginfo.h
+generic-y += sizes.h
generic-y += socket.h
generic-y += sockios.h
generic-y += statfs.h
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index 468fbb0781cd..8a64ff2337f6 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -1,7 +1,7 @@
/*
* Atomic operations for the Hexagon architecture
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
*
*
* This program is free software; you can redistribute it and/or modify
@@ -117,35 +117,37 @@ static inline int atomic_sub_return(int i, atomic_t *v)
#define atomic_sub(i, v) atomic_sub_return(i, (v))
/**
- * atomic_add_unless - add unless the number is a given value
+ * __atomic_add_unless - add unless the number is a given value
* @v: pointer to value
* @a: amount to add
* @u: unless value is equal to u
*
- * Returns 1 if the add happened, 0 if it didn't.
+ * Returns old value.
+ *
*/
+
static inline int __atomic_add_unless(atomic_t *v, int a, int u)
{
- int output, __oldval;
+ int __oldval;
+ register int tmp;
+
asm volatile(
"1: %0 = memw_locked(%2);"
" {"
" p3 = cmp.eq(%0, %4);"
" if (p3.new) jump:nt 2f;"
- " %0 = add(%0, %3);"
- " %1 = #0;"
+ " %1 = add(%0, %3);"
" }"
- " memw_locked(%2, p3) = %0;"
+ " memw_locked(%2, p3) = %1;"
" {"
" if !p3 jump 1b;"
- " %1 = #1;"
" }"
"2:"
- : "=&r" (__oldval), "=&r" (output)
+ : "=&r" (__oldval), "=&r" (tmp)
: "r" (v), "r" (a), "r" (u)
: "memory", "p3"
);
- return output;
+ return __oldval;
}
#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
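
The calling-convention change is easiest to see from the generic wrapper; a
sketch of the pattern in <linux/atomic.h> (its exact form there is assumed):

    /* __atomic_add_unless() now returns the value it observed, so
     * "did the add happen?" becomes a comparison against 'u'. */
    static inline int atomic_add_unless(atomic_t *v, int a, int u)
    {
            return __atomic_add_unless(v, a, u) != u;
    }

    /* Hence atomic_inc_not_zero(v) is nonzero iff *v was nonzero
     * and was successfully incremented. */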
diff --git a/arch/hexagon/include/asm/elf.h b/arch/hexagon/include/asm/elf.h
index 1f14e082588e..e1b933a0e121 100644
--- a/arch/hexagon/include/asm/elf.h
+++ b/arch/hexagon/include/asm/elf.h
@@ -1,7 +1,7 @@
/*
* ELF definitions for the Hexagon architecture
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -104,6 +104,16 @@ typedef unsigned long elf_fpregset_t;
* Bypass the whole "regsets" thing for now and use the define.
*/
+#if CONFIG_HEXAGON_ARCH_VERSION >= 4
+#define CS_COPYREGS(DEST,REGS) \
+do {\
+ DEST.cs0 = REGS->cs0;\
+ DEST.cs1 = REGS->cs1;\
+} while (0)
+#else
+#define CS_COPYREGS(DEST,REGS)
+#endif
+
#define ELF_CORE_COPY_REGS(DEST, REGS) \
do { \
DEST.r0 = REGS->r00; \
@@ -148,13 +158,12 @@ do { \
DEST.p3_0 = REGS->preds; \
DEST.gp = REGS->gp; \
DEST.ugp = REGS->ugp; \
- DEST.pc = pt_elr(REGS); \
+ CS_COPYREGS(DEST,REGS); \
+ DEST.pc = pt_elr(REGS); \
DEST.cause = pt_cause(REGS); \
DEST.badva = pt_badva(REGS); \
} while (0);
-
-
/*
* This is used to ensure we don't load something for the wrong architecture.
* Checks the machine and ABI type.
@@ -168,15 +177,15 @@ do { \
#define ELF_DATA ELFDATA2LSB
#define ELF_ARCH EM_HEXAGON
-#ifdef CONFIG_HEXAGON_ARCH_V2
+#if CONFIG_HEXAGON_ARCH_VERSION == 2
#define ELF_CORE_EFLAGS 0x1
#endif
-#ifdef CONFIG_HEXAGON_ARCH_V3
+#if CONFIG_HEXAGON_ARCH_VERSION == 3
#define ELF_CORE_EFLAGS 0x2
#endif
-#ifdef CONFIG_HEXAGON_ARCH_V4
+#if CONFIG_HEXAGON_ARCH_VERSION == 4
#define ELF_CORE_EFLAGS 0x3
#endif
diff --git a/arch/hexagon/include/asm/hexagon_vm.h b/arch/hexagon/include/asm/hexagon_vm.h
index c144bee6cabe..67bb6d6f3337 100644
--- a/arch/hexagon/include/asm/hexagon_vm.h
+++ b/arch/hexagon/include/asm/hexagon_vm.h
@@ -1,7 +1,7 @@
/*
 * Declarations for the Hexagon Virtual Machine.
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -31,10 +31,26 @@
* for tracing/debugging.
*/
-/*
- * Lets make this stuff visible only if configured,
- * so we can unconditionally include the file.
- */
+#define HVM_TRAP1_VMVERSION 0
+#define HVM_TRAP1_VMRTE 1
+#define HVM_TRAP1_VMSETVEC 2
+#define HVM_TRAP1_VMSETIE 3
+#define HVM_TRAP1_VMGETIE 4
+#define HVM_TRAP1_VMINTOP 5
+#define HVM_TRAP1_VMCLRMAP 10
+#define HVM_TRAP1_VMNEWMAP 11
+#define HVM_TRAP1_FORMERLY_VMWIRE 12
+#define HVM_TRAP1_VMCACHE 13
+#define HVM_TRAP1_VMGETTIME 14
+#define HVM_TRAP1_VMSETTIME 15
+#define HVM_TRAP1_VMWAIT 16
+#define HVM_TRAP1_VMYIELD 17
+#define HVM_TRAP1_VMSTART 18
+#define HVM_TRAP1_VMSTOP 19
+#define HVM_TRAP1_VMVPID 20
+#define HVM_TRAP1_VMSETREGS 21
+#define HVM_TRAP1_VMGETREGS 22
+#define HVM_TRAP1_VMTIMEROP 24
#ifndef __ASSEMBLY__
@@ -175,31 +191,19 @@ static inline long __vmintop_clear(long i)
#else /* Only assembly code should reference these */
-#define HVM_TRAP1_VMRTE 1
-#define HVM_TRAP1_VMSETVEC 2
-#define HVM_TRAP1_VMSETIE 3
-#define HVM_TRAP1_VMGETIE 4
-#define HVM_TRAP1_VMINTOP 5
-#define HVM_TRAP1_VMCLRMAP 10
-#define HVM_TRAP1_VMNEWMAP 11
-#define HVM_TRAP1_FORMERLY_VMWIRE 12
-#define HVM_TRAP1_VMCACHE 13
-#define HVM_TRAP1_VMGETTIME 14
-#define HVM_TRAP1_VMSETTIME 15
-#define HVM_TRAP1_VMWAIT 16
-#define HVM_TRAP1_VMYIELD 17
-#define HVM_TRAP1_VMSTART 18
-#define HVM_TRAP1_VMSTOP 19
-#define HVM_TRAP1_VMVPID 20
-#define HVM_TRAP1_VMSETREGS 21
-#define HVM_TRAP1_VMGETREGS 22
-
#endif /* __ASSEMBLY__ */
/*
* Constants for virtual instruction parameters and return values
*/
+/* vmnewmap arguments */
+
+#define VM_TRANS_TYPE_LINEAR 0
+#define VM_TRANS_TYPE_TABLE 1
+#define VM_TLB_INVALIDATE_FALSE 0
+#define VM_TLB_INVALIDATE_TRUE 1
+
/* vmsetie arguments */
#define VM_INT_DISABLE 0
@@ -224,6 +228,8 @@ static inline long __vmintop_clear(long i)
#define HVM_VMEST_UM_MSK 1
#define HVM_VMEST_IE_SFT 30
#define HVM_VMEST_IE_MSK 1
+#define HVM_VMEST_SS_SFT 29
+#define HVM_VMEST_SS_MSK 1
#define HVM_VMEST_EVENTNUM_SFT 16
#define HVM_VMEST_EVENTNUM_MSK 0xff
#define HVM_VMEST_CAUSE_SFT 0
@@ -260,6 +266,8 @@ static inline long __vmintop_clear(long i)
#define HVM_GE_C_INVI 0x15
#define HVM_GE_C_PRIVI 0x1B
#define HVM_GE_C_XMAL 0x1C
+#define HVM_GE_C_WREG 0x1D
+#define HVM_GE_C_PCAL 0x1E
#define HVM_GE_C_RMAL 0x20
#define HVM_GE_C_WMAL 0x21
#define HVM_GE_C_RPROT 0x22
diff --git a/arch/hexagon/include/asm/io.h b/arch/hexagon/include/asm/io.h
index e527cfeff5ba..1b7698e19139 100644
--- a/arch/hexagon/include/asm/io.h
+++ b/arch/hexagon/include/asm/io.h
@@ -1,7 +1,7 @@
/*
* IO definitions for the Hexagon architecture
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -40,6 +40,8 @@
#define IO_SPACE_LIMIT 0xffff
#define _IO_BASE ((void __iomem *)0xfe000000)
+#define IOMEM(x) ((void __force __iomem *)(x))
+
extern int remap_area_pages(unsigned long start, unsigned long phys_addr,
unsigned long end, unsigned long flags);
@@ -176,6 +178,18 @@ static inline void writel(u32 data, volatile void __iomem *addr)
#define __raw_readl readl
/*
+ * http://comments.gmane.org/gmane.linux.ports.arm.kernel/117626
+ */
+
+#define readb_relaxed __raw_readb
+#define readw_relaxed __raw_readw
+#define readl_relaxed __raw_readl
+
+#define writeb_relaxed __raw_writeb
+#define writew_relaxed __raw_writew
+#define writel_relaxed __raw_writel
+
+/*
* Need an mtype somewhere in here, for cache type deals?
* This is probably too long for an inline.
*/
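
A brief sketch of where the new relaxed accessors pay off; the device layout
(a FIFO data register at 0x10, status at 0x14) is hypothetical:

    /* On Hexagon the *_relaxed forms alias __raw_*, skipping the
     * ordering guarantees of readl()/writel(); useful for bursts of
     * accesses to one device where ordering against normal memory
     * does not matter. */
    static void drain_fifo(void __iomem *base, u32 *dst, int words)
    {
            int i;

            for (i = 0; i < words; i++)
                    dst[i] = readl_relaxed(base + 0x10);

            (void)readl(base + 0x14);  /* one ordered read to close the burst */
    }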
diff --git a/arch/hexagon/include/asm/mem-layout.h b/arch/hexagon/include/asm/mem-layout.h
index af16e977c55e..60556f8c45d8 100644
--- a/arch/hexagon/include/asm/mem-layout.h
+++ b/arch/hexagon/include/asm/mem-layout.h
@@ -1,7 +1,7 @@
/*
* Memory layout definitions for the Hexagon architecture
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -32,16 +32,25 @@
#define PAGE_OFFSET _AC(0xc0000000, UL)
/*
- * LOAD_ADDRESS is the physical/linear address of where in memory
- * the kernel gets loaded. The 12 least significant bits must be zero (0)
- * due to limitations on setting the EVB
- *
+ * Compiling for a platform that needs a crazy physical offset
+ * (like if the memory starts at 1GB and up) means we need
+ * an actual PHYS_OFFSET. Should be set up in head.S.
*/
-#ifndef LOAD_ADDRESS
-#define LOAD_ADDRESS 0x00000000
+#ifdef CONFIG_HEXAGON_PHYS_OFFSET
+#ifndef __ASSEMBLY__
+extern unsigned long __phys_offset;
+#endif
+#define PHYS_OFFSET __phys_offset
+#endif
+
+#ifndef PHYS_OFFSET
+#define PHYS_OFFSET 0
#endif
+#define PHYS_PFN_OFFSET (PHYS_OFFSET >> PAGE_SHIFT)
+#define ARCH_PFN_OFFSET PHYS_PFN_OFFSET
+
#define TASK_SIZE (PAGE_OFFSET)
/* not sure how these are used yet */
@@ -55,7 +64,7 @@ enum fixed_addresses {
__end_of_fixed_addresses
};
-#define MIN_KERNEL_SEG 0x300 /* From 0xc0000000 */
+#define MIN_KERNEL_SEG (PAGE_OFFSET >> PGDIR_SHIFT) /* L1 shift is 22 bits */
extern int max_kernel_seg;
/*
@@ -63,8 +72,7 @@ extern int max_kernel_seg;
* supposed to be based on the amount of physical memory available
*/
-#define VMALLOC_START (PAGE_OFFSET + VMALLOC_OFFSET + \
- (unsigned long)high_memory)
+#define VMALLOC_START ((unsigned long) __va(high_memory + VMALLOC_OFFSET))
/* Gap between physical ram and vmalloc space for guard purposes. */
#define VMALLOC_OFFSET PAGE_SIZE
diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h
index 692adc213429..93f5669b4aa1 100644
--- a/arch/hexagon/include/asm/page.h
+++ b/arch/hexagon/include/asm/page.h
@@ -1,7 +1,7 @@
/*
* Page management definitions for the Hexagon architecture
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -96,8 +96,8 @@ typedef struct page *pgtable_t;
* MIPS says they're only used during mem_init.
* also, check if we need a PHYS_OFFSET.
*/
-#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET)
-#define __va(x) ((void *)((unsigned long)(x) + PAGE_OFFSET))
+#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET + PHYS_OFFSET)
+#define __va(x) ((void *)((unsigned long)(x) - PHYS_OFFSET + PAGE_OFFSET))
/* The "page frame" descriptor is defined in linux/mm.h */
struct page;
@@ -140,6 +140,11 @@ static inline void clear_page(void *page)
*/
#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
+#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT)
+
+#define page_to_virt(page) __va(page_to_phys(page))
+
/*
* For port to Hexagon Virtual Machine, MAYBE we check for attempts
* to reference reserved HVM space, but in any case, the VM will be
@@ -147,6 +152,7 @@ static inline void clear_page(void *page)
*/
#define kern_addr_valid(addr) (1)
+#include <asm/mem-layout.h>
#include <asm-generic/memory_model.h>
/* XXX Todo: implement assembly-optimized version of getorder. */
#include <asm-generic/getorder.h>
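
A worked example of the translation above, assuming PAGE_OFFSET = 0xc0000000
and a hypothetical PHYS_OFFSET of 0x40000000 (RAM starting at 1GB):

    /*
     *   __pa(0xc0001000) = 0xc0001000 - 0xc0000000 + 0x40000000 = 0x40001000
     *   __va(0x40001000) = 0x40001000 - 0x40000000 + 0xc0000000 = 0xc0001000
     *
     * so __va(__pa(x)) == x. virt_to_pfn()/pfn_to_virt() are the same
     * maps shifted by PAGE_SHIFT, and ARCH_PFN_OFFSET (PHYS_OFFSET >>
     * PAGE_SHIFT) keeps pfn_to_page() indexing from the first real
     * page frame.
     */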
diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h
index 6dd5d3706869..45a825402f63 100644
--- a/arch/hexagon/include/asm/processor.h
+++ b/arch/hexagon/include/asm/processor.h
@@ -1,7 +1,7 @@
/*
* Process/processor support for the Hexagon architecture
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -100,12 +100,49 @@ extern unsigned long get_wchan(struct task_struct *p);
*/
struct hexagon_switch_stack {
- unsigned long long r1716;
- unsigned long long r1918;
- unsigned long long r2120;
- unsigned long long r2322;
- unsigned long long r2524;
- unsigned long long r2726;
+ union {
+ struct {
+ unsigned long r16;
+ unsigned long r17;
+ };
+ unsigned long long r1716;
+ };
+ union {
+ struct {
+ unsigned long r18;
+ unsigned long r19;
+ };
+ unsigned long long r1918;
+ };
+ union {
+ struct {
+ unsigned long r20;
+ unsigned long r21;
+ };
+ unsigned long long r2120;
+ };
+ union {
+ struct {
+ unsigned long r22;
+ unsigned long r23;
+ };
+ unsigned long long r2322;
+ };
+ union {
+ struct {
+ unsigned long r24;
+ unsigned long r25;
+ };
+ unsigned long long r2524;
+ };
+ union {
+ struct {
+ unsigned long r26;
+ unsigned long r27;
+ };
+ unsigned long long r2726;
+ };
+
unsigned long fp;
unsigned long lr;
};
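
A small sketch of what the anonymous unions buy; 'set_kthread_args' is a
hypothetical helper mirroring the copy_thread() change later in this series:

    /* Callee-saved slots can now be addressed as 32-bit registers
     * rather than packed 64-bit pairs. */
    static void set_kthread_args(struct hexagon_switch_stack *ss,
                                 unsigned long fn, unsigned long arg)
    {
            ss->r24 = fn;   /* was: ss->r2524 = fn | ((u64)arg << 32); */
            ss->r25 = arg;
    }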
diff --git a/arch/hexagon/include/asm/vm_mmu.h b/arch/hexagon/include/asm/vm_mmu.h
index 9a94de7969bb..096537d8f4c5 100644
--- a/arch/hexagon/include/asm/vm_mmu.h
+++ b/arch/hexagon/include/asm/vm_mmu.h
@@ -1,7 +1,7 @@
/*
* Hexagon VM page table entry definitions
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2011,2013 The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -68,14 +68,13 @@
#define __HEXAGON_C_WB 0x0 /* Write-back, no L2 */
#define __HEXAGON_C_WT 0x1 /* Write-through, no L2 */
+#define __HEXAGON_C_UNC 0x6 /* Uncached memory */
+#if CONFIG_HEXAGON_ARCH_VERSION >= 2
#define __HEXAGON_C_DEV 0x4 /* Device register space */
-#define __HEXAGON_C_WT_L2 0x5 /* Write-through, with L2 */
-/* this really should be #if CONFIG_HEXAGON_ARCH = 2 but that's not defined */
-#if defined(CONFIG_HEXAGON_COMET) || defined(CONFIG_QDSP6_ST1)
-#define __HEXAGON_C_UNC __HEXAGON_C_DEV
#else
-#define __HEXAGON_C_UNC 0x6 /* Uncached memory */
+#define __HEXAGON_C_DEV __HEXAGON_C_UNC
#endif
+#define __HEXAGON_C_WT_L2 0x5 /* Write-through, with L2 */
#define __HEXAGON_C_WB_L2 0x7 /* Write-back, with L2 */
/*
diff --git a/arch/hexagon/include/uapi/asm/ptrace.h b/arch/hexagon/include/uapi/asm/ptrace.h
index 1ffce0c6ee07..065e5b32313f 100644
--- a/arch/hexagon/include/uapi/asm/ptrace.h
+++ b/arch/hexagon/include/uapi/asm/ptrace.h
@@ -36,4 +36,9 @@ extern const char *regs_query_register_name(unsigned int offset);
((struct pt_regs *) \
((unsigned long)current_thread_info() + THREAD_SIZE) - 1)
+#if CONFIG_HEXAGON_ARCH_VERSION >= 4
+#define arch_has_single_step() (1)
+#endif
+
+
#endif
diff --git a/arch/hexagon/include/uapi/asm/registers.h b/arch/hexagon/include/uapi/asm/registers.h
index c20406f63b5c..487d6ceca5e7 100644
--- a/arch/hexagon/include/uapi/asm/registers.h
+++ b/arch/hexagon/include/uapi/asm/registers.h
@@ -57,10 +57,17 @@ struct pt_regs {
};
union {
struct {
- unsigned long gp;
unsigned long ugp;
+ unsigned long gp;
};
- long long int ugpgp;
+ long long int gpugp;
+ };
+ union {
+ struct {
+ unsigned long cs0;
+ unsigned long cs1;
+ };
+ long long int cs1cs0;
};
/*
* Be extremely careful with rearranging these, if at all. Some code
@@ -204,9 +211,11 @@ struct pt_regs {
#define pt_psp(regs) ((regs)->hvmer.vmpsp)
#define pt_badva(regs) ((regs)->hvmer.vmbadva)
+#define pt_set_singlestep(regs) ((regs)->hvmer.vmest |= (1<<HVM_VMEST_SS_SFT))
+#define pt_clr_singlestep(regs) ((regs)->hvmer.vmest &= ~(1<<HVM_VMEST_SS_SFT))
+
#define pt_set_rte_sp(regs, sp) do {\
- pt_psp(regs) = (sp);\
- (regs)->SP = (unsigned long) &((regs)->hvmer);\
+ pt_psp(regs) = (regs)->SP = (sp);\
} while (0)
#define pt_set_kmode(regs) \
diff --git a/arch/hexagon/include/uapi/asm/signal.h b/arch/hexagon/include/uapi/asm/signal.h
index 939556817d34..98106e55ad4f 100644
--- a/arch/hexagon/include/uapi/asm/signal.h
+++ b/arch/hexagon/include/uapi/asm/signal.h
@@ -19,8 +19,12 @@
#ifndef _ASM_SIGNAL_H
#define _ASM_SIGNAL_H
+#include <uapi/asm/registers.h>
+
extern unsigned long __rt_sigtramp_template[2];
+void do_signal(struct pt_regs *regs);
+
#include <asm-generic/signal.h>
#endif
diff --git a/arch/hexagon/include/uapi/asm/unistd.h b/arch/hexagon/include/uapi/asm/unistd.h
index 4a87cc47075c..ffee405d6803 100644
--- a/arch/hexagon/include/uapi/asm/unistd.h
+++ b/arch/hexagon/include/uapi/asm/unistd.h
@@ -27,6 +27,9 @@
*/
#define sys_mmap2 sys_mmap_pgoff
+#define __ARCH_WANT_SYS_EXECVE
#define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_SYS_VFORK
+#define __ARCH_WANT_SYS_FORK
#include <asm-generic/unistd.h>
diff --git a/arch/hexagon/include/uapi/asm/user.h b/arch/hexagon/include/uapi/asm/user.h
index cef13ee1413f..3dae94d9ced7 100644
--- a/arch/hexagon/include/uapi/asm/user.h
+++ b/arch/hexagon/include/uapi/asm/user.h
@@ -55,9 +55,15 @@ struct user_regs_struct {
unsigned long pc;
unsigned long cause;
unsigned long badva;
+#if CONFIG_HEXAGON_ARCH_VERSION < 4
unsigned long pad1; /* pad out to 48 words total */
unsigned long pad2; /* pad out to 48 words total */
unsigned long pad3; /* pad out to 48 words total */
+#else
+ unsigned long cs0;
+ unsigned long cs1;
+ unsigned long pad1; /* pad out to 48 words total */
+#endif
};
#endif
diff --git a/arch/hexagon/kernel/Makefile b/arch/hexagon/kernel/Makefile
index 6c19501b487c..29fc933a7722 100644
--- a/arch/hexagon/kernel/Makefile
+++ b/arch/hexagon/kernel/Makefile
@@ -1,6 +1,6 @@
extra-y := head.o vmlinux.lds
-obj-$(CONFIG_SMP) += smp.o topology.o
+obj-$(CONFIG_SMP) += smp.o
obj-y += setup.o irq_cpu.o traps.o syscalltab.o signal.o time.o
obj-y += process.o trampoline.o reset.o ptrace.o vdso.o
diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c
index 2d5e84d3b00d..308be68d4fb3 100644
--- a/arch/hexagon/kernel/asm-offsets.c
+++ b/arch/hexagon/kernel/asm-offsets.c
@@ -5,7 +5,7 @@
* Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com
* Copyright (C) 2000 MIPS Technologies, Inc.
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -44,7 +44,8 @@ int main(void)
COMMENT("Hexagon pt_regs definitions");
OFFSET(_PT_SYSCALL_NR, pt_regs, syscall_nr);
- OFFSET(_PT_UGPGP, pt_regs, ugpgp);
+ OFFSET(_PT_GPUGP, pt_regs, gpugp);
+ OFFSET(_PT_CS1CS0, pt_regs, cs1cs0);
OFFSET(_PT_R3130, pt_regs, r3130);
OFFSET(_PT_R2928, pt_regs, r2928);
OFFSET(_PT_R2726, pt_regs, r2726);
diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c
index 65c7bdcf565e..b74f9bae31a3 100644
--- a/arch/hexagon/kernel/dma.c
+++ b/arch/hexagon/kernel/dma.c
@@ -1,7 +1,7 @@
/*
* DMA implementation for Hexagon
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -23,12 +23,18 @@
#include <linux/genalloc.h>
#include <asm/dma-mapping.h>
#include <linux/module.h>
+#include <asm/page.h>
struct dma_map_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
int bad_dma_address; /* globals are automatically initialized to zero */
+static inline void *dma_addr_to_virt(dma_addr_t dma_addr)
+{
+ return phys_to_virt((unsigned long) dma_addr);
+}
+
int dma_supported(struct device *dev, u64 mask)
{
if (mask == DMA_BIT_MASK(32))
@@ -60,6 +66,12 @@ static void *hexagon_dma_alloc_coherent(struct device *dev, size_t size,
{
void *ret;
+ /*
+ * Our max_low_pfn should have been backed off by 16MB in
+ * mm/init.c to create DMA coherent space. Use that as the VA
+ * for the pool.
+ */
+
if (coherent_pool == NULL) {
coherent_pool = gen_pool_create(PAGE_SHIFT, -1);
@@ -67,7 +79,7 @@ static void *hexagon_dma_alloc_coherent(struct device *dev, size_t size,
panic("Can't create %s() memory pool!", __func__);
else
gen_pool_add(coherent_pool,
- (PAGE_OFFSET + (max_low_pfn << PAGE_SHIFT)),
+ pfn_to_virt(max_low_pfn),
hexagon_coherent_pool_size, -1);
}
@@ -75,7 +87,7 @@ static void *hexagon_dma_alloc_coherent(struct device *dev, size_t size,
if (ret) {
memset(ret, 0, size);
- *dma_addr = (dma_addr_t) (ret - PAGE_OFFSET);
+ *dma_addr = (dma_addr_t) virt_to_phys(ret);
} else
*dma_addr = ~0;
@@ -118,8 +130,8 @@ static int hexagon_map_sg(struct device *hwdev, struct scatterlist *sg,
s->dma_length = s->length;
- flush_dcache_range(PAGE_OFFSET + s->dma_address,
- PAGE_OFFSET + s->dma_address + s->length);
+ flush_dcache_range(dma_addr_to_virt(s->dma_address),
+ dma_addr_to_virt(s->dma_address + s->length));
}
return nents;
@@ -149,11 +161,6 @@ static inline void dma_sync(void *addr, size_t size,
}
}
-static inline void *dma_addr_to_virt(dma_addr_t dma_addr)
-{
- return phys_to_virt((unsigned long) dma_addr);
-}
-
/**
* hexagon_map_page() - maps an address for device DMA
* @dev: pointer to DMA device
diff --git a/arch/hexagon/kernel/head.S b/arch/hexagon/kernel/head.S
index d859402c73ba..b9b63d085db2 100644
--- a/arch/hexagon/kernel/head.S
+++ b/arch/hexagon/kernel/head.S
@@ -1,7 +1,7 @@
/*
* Early kernel startup code for Hexagon
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
*
*
* This program is free software; you can redistribute it and/or modify
@@ -25,6 +25,9 @@
#include <asm/mem-layout.h>
#include <asm/vm_mmu.h>
#include <asm/page.h>
+#include <asm/hexagon_vm.h>
+
+#define SEGTABLE_ENTRIES #0x0e0
__INIT
ENTRY(stext)
@@ -43,40 +46,93 @@ ENTRY(stext)
* Symbol is kernel segment address, but we need
* the logical/physical address.
*/
- r24 = asl(r24, #2)
- r24 = lsr(r24, #2)
+ r25 = pc;
+ r2.h = #0xffc0;
+ r2.l = #0x0000;
+ r25 = and(r2,r25); /* R25 holds PHYS_OFFSET now */
+ r1.h = #HI(PAGE_OFFSET);
+ r1.l = #LO(PAGE_OFFSET);
+ r24 = sub(r24,r1); /* swapper_pg_dir - PAGE_OFFSET */
+ r24 = add(r24,r25); /* + PHYS_OFFSET */
- r0 = r24
+ r0 = r24; /* aka __pa(swapper_pg_dir) */
/*
- * Initialize a 16MB PTE to make the virtual and physical
+ * Initialize page dir to make the virtual and physical
* addresses where the kernel was loaded be identical.
+ * Done in 4MB chunks.
*/
#define PTE_BITS ( __HVM_PTE_R | __HVM_PTE_W | __HVM_PTE_X \
| __HEXAGON_C_WB_L2 << 6 \
| __HVM_PDE_S_4MB)
- r1 = pc
- r2.H = #0xffc0
- r2.L = #0x0000
- r1 = and(r1,r2) /* round PC to 4MB boundary */
+ /*
+ * Get number of VA=PA entries; only really needed for jump
+ * to hyperspace; gets blown away immediately after
+ */
+
+ {
+ r1.l = #LO(_end);
+ r2.l = #LO(stext);
+ r3 = #1;
+ }
+ {
+ r1.h = #HI(_end);
+ r2.h = #HI(stext);
+ r3 = asl(r3, #22);
+ }
+ {
+ r1 = sub(r1, r2);
+ r3 = add(r3, #-1);
+ } /* r1 = _end - stext */
+ r1 = add(r1, r3); /* + (4M-1) */
+ r26 = lsr(r1, #22); /* / 4M = # of entries */
+
+ r1 = r25;
+ r2.h = #0xffc0;
+ r2.l = #0x0000; /* round back down to 4MB boundary */
+ r1 = and(r1,r2);
r2 = lsr(r1, #22) /* 4MB page number */
r2 = asl(r2, #2) /* times sizeof(PTE) (4bytes) */
r0 = add(r0,r2) /* r0 = address of correct PTE */
r2 = #PTE_BITS
r1 = add(r1,r2) /* r1 = 4MB PTE for the first entry */
r2.h = #0x0040
- r2.l = #0x0000 /* 4MB */
- memw(r0 ++ #4) = r1
- r1 = add(r1, r2)
+ r2.l = #0x0000 /* 4MB increments */
+ loop0(1f,r26);
+1:
memw(r0 ++ #4) = r1
+ { r1 = add(r1, r2); } :endloop0
+
+ /* Also need to overwrite the initial 0xc0000000 entries */
+ /* PAGE_OFFSET >> (4MB shift - 4 bytes per entry shift) */
+ R1.H = #HI(PAGE_OFFSET >> (22 - 2))
+ R1.L = #LO(PAGE_OFFSET >> (22 - 2))
+
+ r0 = add(r1, r24); /* advance to 0xc0000000 entry */
+ r1 = r25;
+ r2.h = #0xffc0;
+ r2.l = #0x0000; /* round back down to 4MB boundary */
+ r1 = and(r1,r2); /* for huge page */
+ r2 = #PTE_BITS
+ r1 = add(r1,r2);
+ r2.h = #0x0040
+ r2.l = #0x0000 /* 4MB increments */
- r0 = r24
+ loop0(1f,SEGTABLE_ENTRIES);
+1:
+ memw(r0 ++ #4) = r1;
+ { r1 = add(r1,r2); } :endloop0
+
+ r0 = r24;
/*
* The subroutine wrapper around the virtual instruction touches
* no memory, so we should be able to use it even here.
+ * Note that in this version, R1 and R2 get "clobbered"; see
+ * vm_ops.S
*/
+ r1 = #VM_TRANS_TYPE_TABLE
call __vmnewmap;
/* Jump into virtual address range. */
@@ -90,17 +146,29 @@ ENTRY(stext)
__head_s_vaddr_target:
/*
* Tear down VA=PA translation now that we are running
- * in the desgnated kernel segments.
+ * in kernel virtual space.
*/
r0 = #__HVM_PDE_S_INVALID
- r1 = r24
- loop0(1f,#0x100)
+
+ r1.h = #0xffc0;
+ r1.l = #0x0000;
+ r2 = r25; /* phys_offset */
+ r2 = and(r1,r2);
+
+ r1.l = #lo(swapper_pg_dir)
+ r1.h = #hi(swapper_pg_dir)
+ r2 = lsr(r2, #22) /* 4MB page number */
+ r2 = asl(r2, #2) /* times sizeof(PTE) (4bytes) */
+ r1 = add(r1,r2);
+ loop0(1f,r26)
+
1:
{
memw(R1 ++ #4) = R0
}:endloop0
r0 = r24
+ r1 = #VM_TRANS_TYPE_TABLE
call __vmnewmap
/* Go ahead and install the trap0 return so angel calls work */
@@ -143,6 +211,13 @@ __head_s_vaddr_target:
r2 = sub(r2,r0);
call memset;
+ /* Set PHYS_OFFSET; should be in R25 */
+#ifdef CONFIG_HEXAGON_PHYS_OFFSET
+ r0.l = #LO(__phys_offset);
+ r0.h = #HI(__phys_offset);
+ memw(r0) = r25;
+#endif
+
/* Time to make the doughnuts. */
call start_kernel
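
The r26 loop count computed above is a ceiling division by 4MB; in C, under
the same assumptions about the segment bounds:

    /* Number of 4MB page-table entries needed to cover [stext, _end). */
    static unsigned long va_pa_entries(unsigned long stext, unsigned long end)
    {
            unsigned long size = end - stext;

            return (size + (1UL << 22) - 1) >> 22;  /* ceil(size / 4MB) */
    }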
diff --git a/arch/hexagon/kernel/kgdb.c b/arch/hexagon/kernel/kgdb.c
index 344645370646..82d5c2593323 100644
--- a/arch/hexagon/kernel/kgdb.c
+++ b/arch/hexagon/kernel/kgdb.c
@@ -1,7 +1,7 @@
/*
* arch/hexagon/kernel/kgdb.c - Hexagon KGDB Support
*
- * Copyright (c) 2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -70,6 +70,8 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
{ "lc1", GDB_SIZEOF_REG, offsetof(struct pt_regs, lc1)},
{ " gp", GDB_SIZEOF_REG, offsetof(struct pt_regs, gp)},
{ "ugp", GDB_SIZEOF_REG, offsetof(struct pt_regs, ugp)},
+ { "cs0", GDB_SIZEOF_REG, offsetof(struct pt_regs, cs0)},
+ { "cs1", GDB_SIZEOF_REG, offsetof(struct pt_regs, cs1)},
{ "psp", GDB_SIZEOF_REG, offsetof(struct pt_regs, hvmer.vmpsp)},
{ "elr", GDB_SIZEOF_REG, offsetof(struct pt_regs, hvmer.vmel)},
{ "est", GDB_SIZEOF_REG, offsetof(struct pt_regs, hvmer.vmest)},
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index 9b948c619a03..0a0dd5c05b46 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -24,6 +24,7 @@
#include <linux/tick.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
+#include <linux/tracehook.h>
/*
* Program thread launch. Often defined as a macro in processor.h,
@@ -95,7 +96,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
if (unlikely(p->flags & PF_KTHREAD)) {
memset(childregs, 0, sizeof(struct pt_regs));
/* r24 <- fn, r25 <- arg */
- ss->r2524 = usp | ((u64)arg << 32);
+ ss->r24 = usp;
+ ss->r25 = arg;
pt_set_kmode(childregs);
return 0;
}
@@ -185,3 +187,41 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
{
return 0;
}
+
+
+/*
+ * Called on the exit path of event entry; see vm_entry.S
+ *
+ * Interrupts will already be disabled.
+ *
+ * Returns 0 if there's no need to re-check for more work.
+ */
+
+int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
+{
+ if (!(thread_info_flags & _TIF_WORK_MASK)) {
+ return 0;
+ } /* shortcut -- no work to be done */
+
+ local_irq_enable();
+
+ if (thread_info_flags & _TIF_NEED_RESCHED) {
+ schedule();
+ return 1;
+ }
+
+ if (thread_info_flags & _TIF_SIGPENDING) {
+ do_signal(regs);
+ return 1;
+ }
+
+ if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ return 1;
+ }
+
+ /* Should not even reach here */
+ panic("%s: bad thread_info flags 0x%08x\n", __func__,
+ thread_info_flags);
+}
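
The return-value contract matters because the assembly exit path (see
vm_entry.S below) loops on it; a C rendering of that loop, as a sketch rather
than actual kernel code:

    static void exit_to_user_loop(struct pt_regs *regs)
    {
            for (;;) {
                    u32 flags;

                    local_irq_disable();
                    flags = current_thread_info()->flags;
                    if (!do_work_pending(regs, flags))
                            return;  /* registers restored with IRQs off */
            }
    }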
diff --git a/arch/hexagon/kernel/ptrace.c b/arch/hexagon/kernel/ptrace.c
index 670b1b0bee63..de829eb7f185 100644
--- a/arch/hexagon/kernel/ptrace.c
+++ b/arch/hexagon/kernel/ptrace.c
@@ -1,7 +1,7 @@
/*
* Ptrace support for Hexagon
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -32,6 +32,21 @@
#include <asm/user.h>
+#if arch_has_single_step()
+/* Both called from ptrace_resume */
+void user_enable_single_step(struct task_struct *child)
+{
+ pt_set_singlestep(task_pt_regs(child));
+ set_tsk_thread_flag(child, TIF_SINGLESTEP);
+}
+
+void user_disable_single_step(struct task_struct *child)
+{
+ pt_clr_singlestep(task_pt_regs(child));
+ clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+}
+#endif
+
static int genregs_get(struct task_struct *target,
const struct user_regset *regset,
unsigned int pos, unsigned int count,
@@ -76,6 +91,10 @@ static int genregs_get(struct task_struct *target,
dummy = pt_cause(regs);
ONEXT(&dummy, cause);
ONEXT(&pt_badva(regs), badva);
+#if CONFIG_HEXAGON_ARCH_VERSION >=4
+ ONEXT(&regs->cs0, cs0);
+ ONEXT(&regs->cs1, cs1);
+#endif
/* Pad the rest with zeros, if needed */
if (!ret)
@@ -123,6 +142,11 @@ static int genregs_set(struct task_struct *target,
INEXT(&bucket, cause);
INEXT(&bucket, badva);
+#if CONFIG_HEXAGON_ARCH_VERSION >=4
+ INEXT(&regs->cs0, cs0);
+ INEXT(&regs->cs1, cs1);
+#endif
+
/* Ignore the rest, if needed */
if (!ret)
ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
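
With arch_has_single_step() defined, the generic ptrace core routes
PTRACE_SINGLESTEP through the hooks above; a hedged userspace sketch (error
handling elided):

    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    /* Step a stopped tracee n times; each request makes the kernel
     * set the vmest single-step bit via user_enable_single_step(). */
    static void step_n(pid_t pid, int n)
    {
            int status;

            while (n-- > 0) {
                    ptrace(PTRACE_SINGLESTEP, pid, 0, 0);
                    waitpid(pid, &status, 0);
                    if (!WIFSTOPPED(status))
                            break;
            }
    }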
diff --git a/arch/hexagon/kernel/setup.c b/arch/hexagon/kernel/setup.c
index 94a387835008..bfe13311d70d 100644
--- a/arch/hexagon/kernel/setup.c
+++ b/arch/hexagon/kernel/setup.c
@@ -1,7 +1,7 @@
/*
* Arch related setup for Hexagon
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -68,6 +68,8 @@ void __init setup_arch(char **cmdline_p)
*/
__vmsetvec(_K_VM_event_vector);
+ printk(KERN_INFO "PHYS_OFFSET=0x%08x\n", PHYS_OFFSET);
+
/*
* Simulator has a few differences from the hardware.
* For now, check uninitialized-but-mapped memory
@@ -128,6 +130,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
{
int cpu = (unsigned long) v - 1;
+#ifdef CONFIG_SMP
+ if (!cpu_online(cpu))
+ return 0;
+#endif
+
seq_printf(m, "processor\t: %d\n", cpu);
seq_printf(m, "model name\t: Hexagon Virtual Machine\n");
seq_printf(m, "BogoMips\t: %lu.%02lu\n",
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index 60fa2ca3202b..d7c73874b515 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -1,7 +1,7 @@
/*
* Signal support for Hexagon processor
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -41,6 +41,10 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
{
unsigned long sp = regs->r29;
+ /* check if we would overflow the alt stack */
+ if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
+ return (void __user __force *)-1UL;
+
/* Switch to signal stack if appropriate */
if ((ka->sa.sa_flags & SA_ONSTACK) && (sas_ss_flags(sp) == 0))
sp = current->sas_ss_sp + current->sas_ss_size;
@@ -66,7 +70,10 @@ static int setup_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
err |= __put_user(regs->preds, &sc->sc_regs.p3_0);
err |= __put_user(regs->gp, &sc->sc_regs.gp);
err |= __put_user(regs->ugp, &sc->sc_regs.ugp);
-
+#if CONFIG_HEXAGON_ARCH_VERSION >= 4
+ err |= __put_user(regs->cs0, &sc->sc_regs.cs0);
+ err |= __put_user(regs->cs1, &sc->sc_regs.cs1);
+#endif
tmp = pt_elr(regs); err |= __put_user(tmp, &sc->sc_regs.pc);
tmp = pt_cause(regs); err |= __put_user(tmp, &sc->sc_regs.cause);
tmp = pt_badva(regs); err |= __put_user(tmp, &sc->sc_regs.badva);
@@ -93,7 +100,10 @@ static int restore_sigcontext(struct pt_regs *regs,
err |= __get_user(regs->preds, &sc->sc_regs.p3_0);
err |= __get_user(regs->gp, &sc->sc_regs.gp);
err |= __get_user(regs->ugp, &sc->sc_regs.ugp);
-
+#if CONFIG_HEXAGON_ARCH_VERSION >= 4
+ err |= __get_user(regs->cs0, &sc->sc_regs.cs0);
+ err |= __get_user(regs->cs1, &sc->sc_regs.cs1);
+#endif
err |= __get_user(tmp, &sc->sc_regs.pc); pt_set_elr(regs, tmp);
return err;
@@ -193,7 +203,7 @@ static void handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka,
/*
* Called from return-from-event code.
*/
-static void do_signal(struct pt_regs *regs)
+void do_signal(struct pt_regs *regs)
{
struct k_sigaction sigact;
siginfo_t info;
@@ -210,8 +220,9 @@ static void do_signal(struct pt_regs *regs)
}
/*
- * If we came from a system call, handle the restart.
+ * No (more) signals; if we came from a system call, handle the restart.
*/
+
if (regs->syscall_nr >= 0) {
switch (regs->r00) {
case -ERESTARTNOHAND:
@@ -234,17 +245,6 @@ no_restart:
restore_saved_sigmask();
}
-void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
-{
- if (thread_info_flags & _TIF_SIGPENDING)
- do_signal(regs);
-
- if (thread_info_flags & _TIF_NOTIFY_RESUME) {
- clear_thread_flag(TIF_NOTIFY_RESUME);
- tracehook_notify_resume(regs);
- }
-}
-
/*
* Architecture-specific wrappers for signal-related system calls
*/
@@ -272,21 +272,12 @@ asmlinkage int sys_rt_sigreturn(void)
/* Restore the user's stack as well */
pt_psp(regs) = regs->r29;
- /*
- * Leave a trace in the stack frame that this was a sigreturn.
- * If the system call is to replay, we've already restored the
- * number in the GPR slot and it will be regenerated on the
- * new system call trap entry. Note that if restore_sigcontext()
- * did something other than a bulk copy of the pt_regs struct,
- * we could avoid this assignment by simply not overwriting
- * regs->syscall_nr.
- */
- regs->syscall_nr = __NR_rt_sigreturn;
+ regs->syscall_nr = -1;
if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
- return 0;
+ return regs->r00;
badframe:
force_sig(SIGSEGV, current);
diff --git a/arch/hexagon/kernel/topology.c b/arch/hexagon/kernel/topology.c
deleted file mode 100644
index 352f27e809fd..000000000000
--- a/arch/hexagon/kernel/topology.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * CPU topology for Hexagon
- *
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 and
- * only version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/init.h>
-#include <linux/node.h>
-#include <linux/nodemask.h>
-#include <linux/percpu.h>
-
-/* Swiped from MIPS. */
-
-static DEFINE_PER_CPU(struct cpu, cpu_devices);
-
-static int __init topology_init(void)
-{
- int i, ret;
-
- for_each_present_cpu(i) {
-
- /*
- * register_cpu takes a per_cpu pointer and
- * just points it at another per_cpu struct...
- */
-
- ret = register_cpu(&per_cpu(cpu_devices, i), i);
- if (ret)
- printk(KERN_WARNING "topology_init: register_cpu %d "
- "failed (%d)\n", i, ret);
- }
-
- return 0;
-}
-
-subsys_initcall(topology_init);
diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c
index cc2171b2aa04..7858663352b9 100644
--- a/arch/hexagon/kernel/traps.c
+++ b/arch/hexagon/kernel/traps.c
@@ -1,7 +1,7 @@
/*
* Kernel traps/events for Hexagon processor
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -65,6 +65,10 @@ static const char *ex_name(int ex)
return "Write protection fault";
case HVM_GE_C_XMAL:
return "Misaligned instruction";
+ case HVM_GE_C_WREG:
+ return "Multiple writes to same register in packet";
+ case HVM_GE_C_PCAL:
+ return "Program counter values that are not properly aligned";
case HVM_GE_C_RMAL:
return "Misaligned data load";
case HVM_GE_C_WMAL:
@@ -316,6 +320,12 @@ void do_genex(struct pt_regs *regs)
case HVM_GE_C_XMAL:
misaligned_instruction(regs);
break;
+ case HVM_GE_C_WREG:
+ illegal_instruction(regs);
+ break;
+ case HVM_GE_C_PCAL:
+ misaligned_instruction(regs);
+ break;
case HVM_GE_C_RMAL:
misaligned_data_load(regs);
break;
@@ -348,7 +358,6 @@ long sys_syscall(void)
void do_trap0(struct pt_regs *regs)
{
- unsigned long syscallret = 0;
syscall_fn syscall;
switch (pt_cause(regs)) {
@@ -388,21 +397,11 @@ void do_trap0(struct pt_regs *regs)
} else {
syscall = (syscall_fn)
(sys_call_table[regs->syscall_nr]);
- syscallret = syscall(regs->r00, regs->r01,
+ regs->r00 = syscall(regs->r00, regs->r01,
regs->r02, regs->r03,
regs->r04, regs->r05);
}
- /*
- * If it was a sigreturn system call, don't overwrite
- * r0 value in stack frame with return value.
- *
- * __NR_sigreturn doesn't seem to exist in new unistd.h
- */
-
- if (regs->syscall_nr != __NR_rt_sigreturn)
- regs->r00 = syscallret;
-
/* allow strace to get the syscall return state */
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACE)))
tracehook_report_syscall_exit(regs, 0);
@@ -444,3 +443,14 @@ void do_machcheck(struct pt_regs *regs)
/* Halt and catch fire */
__vmstop();
}
+
+/*
+ * Treat this like the old 0xdb trap.
+ */
+
+void do_debug_exception(struct pt_regs *regs)
+{
+ regs->hvmer.vmest &= ~HVM_VMEST_CAUSE_MSK;
+ regs->hvmer.vmest |= (TRAP_DEBUG << HVM_VMEST_CAUSE_SFT);
+ do_trap0(regs);
+}
diff --git a/arch/hexagon/kernel/vm_entry.S b/arch/hexagon/kernel/vm_entry.S
index 425e50c694f7..e3086185fc9f 100644
--- a/arch/hexagon/kernel/vm_entry.S
+++ b/arch/hexagon/kernel/vm_entry.S
@@ -1,7 +1,7 @@
/*
* Event entry/exit for Hexagon
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -45,48 +45,88 @@
* number in the case where we decode a system call (trap0(#1)).
*/
+#if CONFIG_HEXAGON_ARCH_VERSION < 4
#define save_pt_regs()\
- memd(R0 + #_PT_R3130) = R31:30; \
+ memd(R0 + #_PT_R3130) = R31:30; \
+ { memw(R0 + #_PT_R2928) = R28; \
+ R31 = memw(R0 + #_PT_ER_VMPSP); }\
+ { memw(R0 + #(_PT_R2928 + 4)) = R31; \
+ R31 = ugp; } \
+ { memd(R0 + #_PT_R2726) = R27:26; \
+ R30 = gp ; } \
+ memd(R0 + #_PT_R2524) = R25:24; \
+ memd(R0 + #_PT_R2322) = R23:22; \
+ memd(R0 + #_PT_R2120) = R21:20; \
+ memd(R0 + #_PT_R1918) = R19:18; \
+ memd(R0 + #_PT_R1716) = R17:16; \
+ memd(R0 + #_PT_R1514) = R15:14; \
+ memd(R0 + #_PT_R1312) = R13:12; \
+ { memd(R0 + #_PT_R1110) = R11:10; \
+ R15 = lc0; } \
+ { memd(R0 + #_PT_R0908) = R9:8; \
+ R14 = sa0; } \
+ { memd(R0 + #_PT_R0706) = R7:6; \
+ R13 = lc1; } \
+ { memd(R0 + #_PT_R0504) = R5:4; \
+ R12 = sa1; } \
+ { memd(R0 + #_PT_GPUGP) = R31:30; \
+ R11 = m1; \
+ R2.H = #HI(_THREAD_SIZE); } \
+ { memd(R0 + #_PT_LC0SA0) = R15:14; \
+ R10 = m0; \
+ R2.L = #LO(_THREAD_SIZE); } \
+ { memd(R0 + #_PT_LC1SA1) = R13:12; \
+ R15 = p3:0; \
+ R2 = neg(R2); } \
+ { memd(R0 + #_PT_M1M0) = R11:10; \
+ R14 = usr; \
+ R2 = and(R0,R2); } \
+ { memd(R0 + #_PT_PREDSUSR) = R15:14; \
+ THREADINFO_REG = R2; } \
+ { r24 = memw(THREADINFO_REG + #_THREAD_INFO_PT_REGS); \
+ memw(THREADINFO_REG + #_THREAD_INFO_PT_REGS) = R0; \
+ R2 = #-1; } \
+ { memw(R0 + #_PT_SYSCALL_NR) = R2; \
+ R30 = #0; }
+#else
+/* V4+ */
+/* the # ## # syntax inserts a literal ## */
+#define save_pt_regs()\
+ { memd(R0 + #_PT_R3130) = R31:30; \
+ R30 = memw(R0 + #_PT_ER_VMPSP); }\
{ memw(R0 + #_PT_R2928) = R28; \
- R31 = memw(R0 + #_PT_ER_VMPSP); }\
- { memw(R0 + #(_PT_R2928 + 4)) = R31; \
- R31 = ugp; } \
- { memd(R0 + #_PT_R2726) = R27:26; \
- R30 = gp ; } \
- memd(R0 + #_PT_R2524) = R25:24; \
- memd(R0 + #_PT_R2322) = R23:22; \
- memd(R0 + #_PT_R2120) = R21:20; \
- memd(R0 + #_PT_R1918) = R19:18; \
- memd(R0 + #_PT_R1716) = R17:16; \
- memd(R0 + #_PT_R1514) = R15:14; \
- memd(R0 + #_PT_R1312) = R13:12; \
+ memw(R0 + #(_PT_R2928 + 4)) = R30; }\
+ { R31:30 = C11:10; \
+ memd(R0 + #_PT_R2726) = R27:26; \
+ memd(R0 + #_PT_R2524) = R25:24; }\
+ { memd(R0 + #_PT_R2322) = R23:22; \
+ memd(R0 + #_PT_R2120) = R21:20; }\
+ { memd(R0 + #_PT_R1918) = R19:18; \
+ memd(R0 + #_PT_R1716) = R17:16; }\
+ { memd(R0 + #_PT_R1514) = R15:14; \
+ memd(R0 + #_PT_R1312) = R13:12; \
+ R17:16 = C13:12; }\
{ memd(R0 + #_PT_R1110) = R11:10; \
- R15 = lc0; } \
- { memd(R0 + #_PT_R0908) = R9:8; \
- R14 = sa0; } \
+ memd(R0 + #_PT_R0908) = R9:8; \
+ R15:14 = C1:0; } \
{ memd(R0 + #_PT_R0706) = R7:6; \
- R13 = lc1; } \
- { memd(R0 + #_PT_R0504) = R5:4; \
- R12 = sa1; } \
- { memd(R0 + #_PT_UGPGP) = R31:30; \
- R11 = m1; \
- R2.H = #HI(_THREAD_SIZE); } \
- { memd(R0 + #_PT_LC0SA0) = R15:14; \
- R10 = m0; \
- R2.L = #LO(_THREAD_SIZE); } \
- { memd(R0 + #_PT_LC1SA1) = R13:12; \
- R15 = p3:0; \
- R2 = neg(R2); } \
+ memd(R0 + #_PT_R0504) = R5:4; \
+ R13:12 = C3:2; } \
+ { memd(R0 + #_PT_GPUGP) = R31:30; \
+ memd(R0 + #_PT_LC0SA0) = R15:14; \
+ R11:10 = C7:6; }\
+ { THREADINFO_REG = and(R0, # ## #-_THREAD_SIZE); \
+ memd(R0 + #_PT_LC1SA1) = R13:12; \
+ R15 = p3:0; }\
{ memd(R0 + #_PT_M1M0) = R11:10; \
- R14 = usr; \
- R2 = and(R0,R2); } \
- { memd(R0 + #_PT_PREDSUSR) = R15:14; \
- THREADINFO_REG = R2; } \
+ memw(R0 + #_PT_PREDSUSR + 4) = R15; }\
{ r24 = memw(THREADINFO_REG + #_THREAD_INFO_PT_REGS); \
memw(THREADINFO_REG + #_THREAD_INFO_PT_REGS) = R0; \
R2 = #-1; } \
{ memw(R0 + #_PT_SYSCALL_NR) = R2; \
+ memd(R0 + #_PT_CS1CS0) = R17:16; \
R30 = #0; }
+#endif
/*
* Restore registers and thread_info.regs state. THREADINFO_REG
@@ -94,6 +134,7 @@
* preserved. Don't restore R29 (SP) until later.
*/
+#if CONFIG_HEXAGON_ARCH_VERSION < 4
#define restore_pt_regs() \
{ memw(THREADINFO_REG + #_THREAD_INFO_PT_REGS) = R24; \
R15:14 = memd(R0 + #_PT_PREDSUSR); } \
@@ -121,11 +162,44 @@
R23:22 = memd(R0 + #_PT_R2322); } \
{ R25:24 = memd(R0 + #_PT_R2524); \
R27:26 = memd(R0 + #_PT_R2726); } \
- R31:30 = memd(R0 + #_PT_UGPGP); \
+ R31:30 = memd(R0 + #_PT_GPUGP); \
{ R28 = memw(R0 + #_PT_R2928); \
ugp = R31; } \
{ R31:30 = memd(R0 + #_PT_R3130); \
gp = R30; }
+#else
+/* V4+ */
+#define restore_pt_regs() \
+ { memw(THREADINFO_REG + #_THREAD_INFO_PT_REGS) = R24; \
+ R15:14 = memd(R0 + #_PT_PREDSUSR); } \
+ { R11:10 = memd(R0 + #_PT_M1M0); \
+ R13:12 = memd(R0 + #_PT_LC1SA1); \
+ p3:0 = R15; } \
+ { R15:14 = memd(R0 + #_PT_LC0SA0); \
+ R3:2 = memd(R0 + #_PT_R0302); \
+ usr = R14; } \
+ { R5:4 = memd(R0 + #_PT_R0504); \
+ R7:6 = memd(R0 + #_PT_R0706); \
+ C7:6 = R11:10; }\
+ { R9:8 = memd(R0 + #_PT_R0908); \
+ R11:10 = memd(R0 + #_PT_R1110); \
+ C3:2 = R13:12; }\
+ { R13:12 = memd(R0 + #_PT_R1312); \
+ R15:14 = memd(R0 + #_PT_R1514); \
+ C1:0 = R15:14; }\
+ { R17:16 = memd(R0 + #_PT_R1716); \
+ R19:18 = memd(R0 + #_PT_R1918); } \
+ { R21:20 = memd(R0 + #_PT_R2120); \
+ R23:22 = memd(R0 + #_PT_R2322); } \
+ { R25:24 = memd(R0 + #_PT_R2524); \
+ R27:26 = memd(R0 + #_PT_R2726); } \
+ R31:30 = memd(R0 + #_PT_CS1CS0); \
+ { C13:12 = R31:30; \
+ R31:30 = memd(R0 + #_PT_GPUGP) ; \
+ R28 = memw(R0 + #_PT_R2928); }\
+ { C11:10 = R31:30; \
+ R31:30 = memd(R0 + #_PT_R3130); }
+#endif
/*
* Clears off enough space for the rest of pt_regs; evrec is a part
@@ -139,6 +213,7 @@
* Need to save off R0, R1, R2, R3 immediately.
*/
+#if CONFIG_HEXAGON_ARCH_VERSION < 4
#define vm_event_entry(CHandler) \
{ \
R29 = add(R29, #-(_PT_REGS_SIZE)); \
@@ -158,6 +233,34 @@
R1.H = #HI(CHandler); \
jump event_dispatch; \
}
+#else
+/* V4+ */
+/* turn on I$ prefetch early */
+/* the # ## # syntax inserts a literal ## */
+#define vm_event_entry(CHandler) \
+ { \
+ R29 = add(R29, #-(_PT_REGS_SIZE)); \
+ memd(R29 + #(_PT_R0100 + -_PT_REGS_SIZE)) = R1:0; \
+ memd(R29 + #(_PT_R0302 + -_PT_REGS_SIZE)) = R3:2; \
+ R0 = usr; \
+ } \
+ { \
+ memw(R29 + #_PT_PREDSUSR) = R0; \
+ R0 = setbit(R0, #16); \
+ } \
+ usr = R0; \
+ R1:0 = G1:0; \
+ { \
+ memd(R29 + #_PT_ER_VMEL) = R1:0; \
+ R1 = # ## #(CHandler); \
+ R3:2 = G3:2; \
+ } \
+ { \
+ R0 = R29; \
+ memd(R29 + #_PT_ER_VMPSP) = R3:2; \
+ jump event_dispatch; \
+ }
+#endif
.text
/*
@@ -171,6 +274,9 @@ event_dispatch:
callr r1
/*
+ * Coming back from the C-world, our thread info pointer
+ * should be in the designated register (usually R19)
+ *
* If we were in kernel mode, we don't need to check scheduler
* or signals if CONFIG_PREEMPT is not set. If set, then it has
* to jump to a need_resched kind of block.
@@ -183,69 +289,68 @@ event_dispatch:
#endif
/* "Nested control path" -- if the previous mode was kernel */
- R0 = memw(R29 + #_PT_ER_VMEST);
- P0 = tstbit(R0, #HVM_VMEST_UM_SFT);
- if !P0 jump restore_all;
- /*
- * Returning from system call, normally coming back from user mode
- */
-return_from_syscall:
- /* Disable interrupts while checking TIF */
- R0 = #VM_INT_DISABLE
- trap1(#HVM_TRAP1_VMSETIE)
-
- /*
- * Coming back from the C-world, our thread info pointer
- * should be in the designated register (usually R19)
- */
- R1.L = #LO(_TIF_ALLWORK_MASK)
{
- R1.H = #HI(_TIF_ALLWORK_MASK);
- R0 = memw(THREADINFO_REG + #_THREAD_INFO_FLAGS);
+ R0 = memw(R29 + #_PT_ER_VMEST);
+ R16.L = #LO(do_work_pending);
+ }
+ {
+ P0 = tstbit(R0, #HVM_VMEST_UM_SFT);
+ if (!P0.new) jump:nt restore_all;
+ R16.H = #HI(do_work_pending);
+ R0 = #VM_INT_DISABLE;
}
/*
- * Compare against the "return to userspace" _TIF_WORK_MASK
+ * Check also the return from fork/system call, normally coming back from
+ * user mode
+ *
+ * R16 needs to have do_work_pending, and R0 should have VM_INT_DISABLE
*/
- R1 = and(R1,R0);
- { P0 = cmp.eq(R1,#0); if (!P0.new) jump:t work_pending;}
- jump restore_all; /* we're outta here! */
-work_pending:
+check_work_pending:
+ /* Disable interrupts while checking TIF */
+ trap1(#HVM_TRAP1_VMSETIE)
{
- P0 = tstbit(R1, #TIF_NEED_RESCHED);
- if (!P0.new) jump:nt work_notifysig;
+ R0 = R29; /* regs should still be at top of stack */
+ R1 = memw(THREADINFO_REG + #_THREAD_INFO_FLAGS);
+ callr R16;
}
- call schedule
- jump return_from_syscall; /* check for more work */
-work_notifysig:
- /* this is the part that's kind of fuzzy. */
- R1 = and(R0, #(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME));
- P0 = cmp.eq(R1, #0);
- if P0 jump restore_all
- R1 = R0; /* unsigned long thread_info_flags */
- R0 = R29; /* regs should still be at top of stack */
- call do_notify_resume
+ {
+ P0 = cmp.eq(R0, #0); if (!P0.new) jump:nt check_work_pending;
+ R0 = #VM_INT_DISABLE;
+ }
restore_all:
- /* Disable interrupts, if they weren't already, before reg restore. */
- R0 = #VM_INT_DISABLE
+ /*
+ * Disable interrupts, if they weren't already, before reg restore.
+ * R0 gets preloaded with #VM_INT_DISABLE before we get here.
+ */
trap1(#HVM_TRAP1_VMSETIE)
/* do the setregs here for VM 0.5 */
/* R29 here should already be pointing at pt_regs */
- R1:0 = memd(R29 + #_PT_ER_VMEL);
- R3:2 = memd(R29 + #_PT_ER_VMPSP);
+ {
+ R1:0 = memd(R29 + #_PT_ER_VMEL);
+ R3:2 = memd(R29 + #_PT_ER_VMPSP);
+ }
+#if CONFIG_HEXAGON_ARCH_VERSION < 4
trap1(#HVM_TRAP1_VMSETREGS);
+#else
+ G1:0 = R1:0;
+ G3:2 = R3:2;
+#endif
R0 = R29
restore_pt_regs()
- R1:0 = memd(R29 + #_PT_R0100);
- R29 = add(R29, #_PT_REGS_SIZE);
+ {
+ R1:0 = memd(R29 + #_PT_R0100);
+ R29 = add(R29, #_PT_REGS_SIZE);
+ }
trap1(#HVM_TRAP1_VMRTE)
/* Notreached */
+
.globl _K_enter_genex
_K_enter_genex:
vm_event_entry(do_genex)
@@ -262,12 +367,27 @@ _K_enter_trap0:
_K_enter_machcheck:
vm_event_entry(do_machcheck)
+ .globl _K_enter_debug
+_K_enter_debug:
+ vm_event_entry(do_debug_exception)
.globl ret_from_fork
ret_from_fork:
- call schedule_tail
- P0 = cmp.eq(R24, #0);
- if P0 jump return_from_syscall
- R0 = R25;
- callr R24
- jump return_from_syscall
+ {
+ call schedule_tail
+ R16.H = #HI(do_work_pending);
+ }
+ {
+ P0 = cmp.eq(R24, #0);
+ R16.L = #LO(do_work_pending);
+ R0 = #VM_INT_DISABLE;
+ }
+ if P0 jump check_work_pending
+ {
+ R0 = R25;
+ callr R24
+ }
+ {
+ jump check_work_pending
+ R0 = #VM_INT_DISABLE;
+ }
diff --git a/arch/hexagon/kernel/vm_events.c b/arch/hexagon/kernel/vm_events.c
index f337281ebe67..741aaa917cda 100644
--- a/arch/hexagon/kernel/vm_events.c
+++ b/arch/hexagon/kernel/vm_events.c
@@ -1,7 +1,7 @@
/*
* Mostly IRQ support for Hexagon
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -44,6 +44,8 @@ void show_regs(struct pt_regs *regs)
regs->lc1, regs->sa1, regs->m1);
printk(KERN_EMERG "gp: \t0x%08lx ugp: 0x%08lx usr: 0x%08lx\n",
regs->gp, regs->ugp, regs->usr);
+ printk(KERN_EMERG "cs0: \t0x%08lx cs1: 0x%08lx\n",
+ regs->cs0, regs->cs1);
printk(KERN_EMERG "r0: \t0x%08lx %08lx %08lx %08lx\n", regs->r00,
regs->r01,
regs->r02,
diff --git a/arch/hexagon/kernel/vm_vectors.S b/arch/hexagon/kernel/vm_vectors.S
index 620f42cc582a..791a7422dde4 100644
--- a/arch/hexagon/kernel/vm_vectors.S
+++ b/arch/hexagon/kernel/vm_vectors.S
@@ -1,7 +1,7 @@
/*
* Event jump tables
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2012,2013, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -41,7 +41,7 @@ _K_VM_event_vector:
jump 1b; /* Reset */
jump _K_enter_machcheck;
jump _K_enter_genex;
- jump 1b; /* 3 Rsvd */
+ jump _K_enter_debug;
jump 1b; /* 4 Rsvd */
jump _K_enter_trap0;
jump 1b; /* 6 Rsvd */
diff --git a/arch/hexagon/kernel/vmlinux.lds.S b/arch/hexagon/kernel/vmlinux.lds.S
index 14e793f6abbf..44d8c47bae2f 100644
--- a/arch/hexagon/kernel/vmlinux.lds.S
+++ b/arch/hexagon/kernel/vmlinux.lds.S
@@ -1,7 +1,7 @@
/*
* Linker script for Hexagon kernel
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -18,8 +18,6 @@
* 02110-1301, USA.
*/
-#define LOAD_OFFSET PAGE_OFFSET
-
#include <asm-generic/vmlinux.lds.h>
#include <asm/asm-offsets.h> /* Most of the kernel defines are here */
#include <asm/mem-layout.h> /* except for page_offset */
@@ -36,13 +34,9 @@ See asm-generic/sections.h for seemingly required labels.
#define PAGE_SIZE _PAGE_SIZE
-/* This LOAD_OFFSET is temporary for debugging on the simulator; it may change
- for hypervisor pseudo-physical memory. */
-
-
SECTIONS
{
- . = PAGE_OFFSET + LOAD_ADDRESS;
+ . = PAGE_OFFSET;
__init_begin = .;
HEAD_TEXT_SECTION
@@ -52,7 +46,7 @@ SECTIONS
. = ALIGN(_PAGE_SIZE);
_stext = .;
- .text : AT(ADDR(.text) - LOAD_OFFSET) {
+ .text : AT(ADDR(.text)) {
_text = .;
TEXT_TEXT
SCHED_TEXT
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index 69ffcfd28794..2561d259a296 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -1,7 +1,7 @@
/*
* Memory subsystem initialization for Hexagon
*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -31,9 +31,10 @@
* Define a startpg just past the end of the kernel image and a lastpg
* that corresponds to the end of real or simulated platform memory.
*/
-#define bootmem_startpg (PFN_UP(((unsigned long) _end) - PAGE_OFFSET))
+#define bootmem_startpg (PFN_UP(((unsigned long) _end) - PAGE_OFFSET + PHYS_OFFSET))
-unsigned long bootmem_lastpg; /* Should be set by platform code */
+unsigned long bootmem_lastpg; /* Should be set by platform code */
+unsigned long __phys_offset; /* physical kernel offset >> 12 */
/* Set as variable to limit PMD copies */
int max_kernel_seg = 0x303;
@@ -44,7 +45,6 @@ unsigned long zero_page_mask;
/* indicate pfn's of high memory */
unsigned long highstart_pfn, highend_pfn;
-/* struct mmu_gather defined in asm-generic.h; */
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
/* Default cache attribute for newly created page tables */
@@ -71,7 +71,7 @@ void __init mem_init(void)
{
/* No idea where this is actually declared. Seems to evade LXR. */
totalram_pages += free_all_bootmem();
- num_physpages = bootmem_lastpg; /* seriously, what? */
+ num_physpages = bootmem_lastpg - ARCH_PFN_OFFSET;
printk(KERN_INFO "totalram_pages = %ld\n", totalram_pages);
@@ -193,6 +193,9 @@ void __init setup_arch_memory(void)
* This needs to change for highmem setups.
*/
+ /* Prior to this, bootmem_lastpg is actually mem size */
+ bootmem_lastpg += ARCH_PFN_OFFSET;
+
/* Memory size needs to be a multiple of 16M */
bootmem_lastpg = PFN_DOWN((bootmem_lastpg << PAGE_SHIFT) &
~((BIG_KERNEL_PAGE_SIZE) - 1));
@@ -201,12 +204,15 @@ void __init setup_arch_memory(void)
* Reserve the top DMA_RESERVE bytes of RAM for DMA (uncached)
* memory allocation
*/
- bootmap_size = init_bootmem(bootmem_startpg, bootmem_lastpg -
- PFN_DOWN(DMA_RESERVED_BYTES));
+
+ max_low_pfn = bootmem_lastpg - PFN_DOWN(DMA_RESERVED_BYTES);
+ min_low_pfn = ARCH_PFN_OFFSET;
+ bootmap_size = init_bootmem_node(NODE_DATA(0), bootmem_startpg, min_low_pfn, max_low_pfn);
printk(KERN_INFO "bootmem_startpg: 0x%08lx\n", bootmem_startpg);
printk(KERN_INFO "bootmem_lastpg: 0x%08lx\n", bootmem_lastpg);
printk(KERN_INFO "bootmap_size: %d\n", bootmap_size);
+ printk(KERN_INFO "min_low_pfn: 0x%08lx\n", min_low_pfn);
printk(KERN_INFO "max_low_pfn: 0x%08lx\n", max_low_pfn);
/*
@@ -221,14 +227,17 @@ void __init setup_arch_memory(void)
/* this actually only goes to the end of the first gig */
segtable_end = segtable + (1<<(30-22));
- /* Move forward to the start of empty pages */
- segtable += bootmem_lastpg >> (22-PAGE_SHIFT);
+ /*
+ * Move forward to the start of empty pages; take into account
+ * phys_offset shift.
+ */
+ segtable += (bootmem_lastpg - ARCH_PFN_OFFSET) >> (22 - PAGE_SHIFT);
{
- int i;
+ int i;
- for (i = 1 ; i <= DMA_RESERVE ; i++)
- segtable[-i] = ((segtable[-i] & __HVM_PTE_PGMASK_4MB)
+ for (i = 1 ; i <= DMA_RESERVE ; i++)
+ segtable[-i] = ((segtable[-i] & __HVM_PTE_PGMASK_4MB)
| __HVM_PTE_R | __HVM_PTE_W | __HVM_PTE_X
| __HEXAGON_C_UNC << 6
| __HVM_PDE_S_4MB);
@@ -256,7 +265,7 @@ void __init setup_arch_memory(void)
* Free all the memory that wasn't taken up by the bootmap, the DMA
* reserve, or kernel itself.
*/
- free_bootmem(PFN_PHYS(bootmem_startpg)+bootmap_size,
+ free_bootmem(PFN_PHYS(bootmem_startpg) + bootmap_size,
PFN_PHYS(bootmem_lastpg - bootmem_startpg) - bootmap_size -
DMA_RESERVED_BYTES);
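
A minimal userspace sketch of the pfn bookkeeping introduced above (all values hypothetical): with a nonzero physical load offset, the last page frame number becomes the memory size in pages plus the starting pfn, and the true page count is recovered by subtracting ARCH_PFN_OFFSET again.

    #include <stdio.h>

    #define PAGE_SHIFT 12UL

    int main(void)
    {
        unsigned long phys_offset = 0x40000000UL;      /* assumed load address */
        unsigned long arch_pfn_offset = phys_offset >> PAGE_SHIFT;
        unsigned long mem_pages = 0x10000UL;           /* 256 MiB of 4K pages */

        /* "Prior to this, bootmem_lastpg is actually mem size" */
        unsigned long bootmem_lastpg = mem_pages + arch_pfn_offset;
        unsigned long num_physpages = bootmem_lastpg - arch_pfn_offset;

        printf("lastpg=0x%lx physpages=0x%lx\n", bootmem_lastpg, num_physpages);
        return 0;
    }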
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index 308ef0ce648b..1bd276dbec7d 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -147,7 +147,7 @@ good_area:
}
info.si_errno = 0;
info.si_addr = (void __user *)address;
- force_sig_info(info.si_code, &info, current);
+ force_sig_info(info.si_signo, &info, current);
return;
bad_area:
@@ -158,7 +158,7 @@ bad_area:
info.si_errno = 0;
info.si_code = si_code;
info.si_addr = (void *)address;
- force_sig_info(SIGSEGV, &info, current);
+ force_sig_info(info.si_signo, &info, current);
return;
}
/* Kernel-mode fault falls through */
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 9ab3bf2eca8d..81b9ddbc9166 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -55,9 +55,6 @@ config TRACE_IRQFLAGS_SUPPORT
config GENERIC_CSUM
def_bool y
-config GENERIC_FIND_NEXT_BIT
- def_bool y
-
source "init/Kconfig"
diff --git a/drivers/base/dma-buf.c b/drivers/base/dma-buf.c
index 2a7cb0df176b..08fe897c0b4c 100644
--- a/drivers/base/dma-buf.c
+++ b/drivers/base/dma-buf.c
@@ -27,9 +27,18 @@
#include <linux/dma-buf.h>
#include <linux/anon_inodes.h>
#include <linux/export.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
static inline int is_dma_buf_file(struct file *);
+struct dma_buf_list {
+ struct list_head head;
+ struct mutex lock;
+};
+
+static struct dma_buf_list db_list;
+
static int dma_buf_release(struct inode *inode, struct file *file)
{
struct dma_buf *dmabuf;
@@ -42,6 +51,11 @@ static int dma_buf_release(struct inode *inode, struct file *file)
BUG_ON(dmabuf->vmapping_counter);
dmabuf->ops->release(dmabuf);
+
+ mutex_lock(&db_list.lock);
+ list_del(&dmabuf->list_node);
+ mutex_unlock(&db_list.lock);
+
kfree(dmabuf);
return 0;
}
@@ -77,22 +91,24 @@ static inline int is_dma_buf_file(struct file *file)
}
/**
- * dma_buf_export - Creates a new dma_buf, and associates an anon file
+ * dma_buf_export_named - Creates a new dma_buf, and associates an anon file
* with this buffer, so it can be exported.
* Also connect the allocator specific data and ops to the buffer.
+ * Additionally, provide a name string for the exporter; useful in debugging.
*
* @priv: [in] Attach private data of allocator to this buffer
* @ops: [in] Attach allocator-defined dma buf ops to the new buffer.
* @size: [in] Size of the buffer
* @flags: [in] mode flags for the file.
+ * @exp_name: [in] name of the exporting module - useful for debugging.
*
* Returns, on success, a newly created dma_buf object, which wraps the
* supplied private data and operations for dma_buf_ops. On either missing
 * ops, or an error in allocating struct dma_buf, it returns a negative error.
*
*/
-struct dma_buf *dma_buf_export(void *priv, const struct dma_buf_ops *ops,
- size_t size, int flags)
+struct dma_buf *dma_buf_export_named(void *priv, const struct dma_buf_ops *ops,
+ size_t size, int flags, const char *exp_name)
{
struct dma_buf *dmabuf;
struct file *file;
@@ -114,6 +130,7 @@ struct dma_buf *dma_buf_export(void *priv, const struct dma_buf_ops *ops,
dmabuf->priv = priv;
dmabuf->ops = ops;
dmabuf->size = size;
+ dmabuf->exp_name = exp_name;
file = anon_inode_getfile("dmabuf", &dma_buf_fops, dmabuf, flags);
@@ -122,9 +139,13 @@ struct dma_buf *dma_buf_export(void *priv, const struct dma_buf_ops *ops,
mutex_init(&dmabuf->lock);
INIT_LIST_HEAD(&dmabuf->attachments);
+ mutex_lock(&db_list.lock);
+ list_add(&dmabuf->list_node, &db_list.head);
+ mutex_unlock(&db_list.lock);
+
return dmabuf;
}
-EXPORT_SYMBOL_GPL(dma_buf_export);
+EXPORT_SYMBOL_GPL(dma_buf_export_named);
/**
@@ -548,3 +569,143 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr)
mutex_unlock(&dmabuf->lock);
}
EXPORT_SYMBOL_GPL(dma_buf_vunmap);
+
+#ifdef CONFIG_DEBUG_FS
+static int dma_buf_describe(struct seq_file *s)
+{
+ int ret;
+ struct dma_buf *buf_obj;
+ struct dma_buf_attachment *attach_obj;
+ int count = 0, attach_count;
+ size_t size = 0;
+
+ ret = mutex_lock_interruptible(&db_list.lock);
+
+ if (ret)
+ return ret;
+
+ seq_printf(s, "\nDma-buf Objects:\n");
+ seq_printf(s, "\texp_name\tsize\tflags\tmode\tcount\n");
+
+ list_for_each_entry(buf_obj, &db_list.head, list_node) {
+ ret = mutex_lock_interruptible(&buf_obj->lock);
+
+ if (ret) {
+ seq_printf(s,
+ "\tERROR locking buffer object: skipping\n");
+ continue;
+ }
+
+ seq_printf(s, "\t");
+
+ seq_printf(s, "\t%s\t%08zu\t%08x\t%08x\t%08ld\n",
+ buf_obj->exp_name, buf_obj->size,
+ buf_obj->file->f_flags, buf_obj->file->f_mode,
+ (long)(buf_obj->file->f_count.counter));
+
+ seq_printf(s, "\t\tAttached Devices:\n");
+ attach_count = 0;
+
+ list_for_each_entry(attach_obj, &buf_obj->attachments, node) {
+ seq_printf(s, "\t\t");
+
+ seq_printf(s, "%s\n", attach_obj->dev->init_name);
+ attach_count++;
+ }
+
+ seq_printf(s, "\n\t\tTotal %d devices attached\n",
+ attach_count);
+
+ count++;
+ size += buf_obj->size;
+ mutex_unlock(&buf_obj->lock);
+ }
+
+ seq_printf(s, "\nTotal %d objects, %zu bytes\n", count, size);
+
+ mutex_unlock(&db_list.lock);
+ return 0;
+}
+
+static int dma_buf_show(struct seq_file *s, void *unused)
+{
+	int (*func)(struct seq_file *) = s->private;
+
+	return func(s);
+}
+
+static int dma_buf_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dma_buf_show, inode->i_private);
+}
+
+static const struct file_operations dma_buf_debug_fops = {
+ .open = dma_buf_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *dma_buf_debugfs_dir;
+
+static int dma_buf_init_debugfs(void)
+{
+ int err = 0;
+ dma_buf_debugfs_dir = debugfs_create_dir("dma_buf", NULL);
+ if (IS_ERR(dma_buf_debugfs_dir)) {
+ err = PTR_ERR(dma_buf_debugfs_dir);
+ dma_buf_debugfs_dir = NULL;
+ return err;
+ }
+
+ err = dma_buf_debugfs_create_file("bufinfo", dma_buf_describe);
+
+ if (err)
+ pr_debug("dma_buf: debugfs: failed to create node bufinfo\n");
+
+ return err;
+}
+
+static void dma_buf_uninit_debugfs(void)
+{
+ if (dma_buf_debugfs_dir)
+ debugfs_remove_recursive(dma_buf_debugfs_dir);
+}
+
+int dma_buf_debugfs_create_file(const char *name,
+ int (*write)(struct seq_file *))
+{
+ struct dentry *d;
+
+ d = debugfs_create_file(name, S_IRUGO, dma_buf_debugfs_dir,
+ write, &dma_buf_debug_fops);
+
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+
+ return 0;
+}
+#else
+static inline int dma_buf_init_debugfs(void)
+{
+ return 0;
+}
+static inline void dma_buf_uninit_debugfs(void)
+{
+}
+#endif
+
+static int __init dma_buf_init(void)
+{
+ mutex_init(&db_list.lock);
+ INIT_LIST_HEAD(&db_list.head);
+ dma_buf_init_debugfs();
+ return 0;
+}
+subsys_initcall(dma_buf_init);
+
+static void __exit dma_buf_deinit(void)
+{
+ dma_buf_uninit_debugfs();
+}
+__exitcall(dma_buf_deinit);
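
For context, a hedged sketch of how an exporting driver might adopt the renamed entry point; my_dmabuf_ops, my_export and the "my_driver" name are hypothetical, and real ops must fill in the mandatory attach/map/release callbacks:

    #include <linux/dma-buf.h>
    #include <linux/fcntl.h>

    static const struct dma_buf_ops my_dmabuf_ops;  /* stub for the sketch */

    static struct dma_buf *my_export(void *my_dev_buf, size_t size)
    {
        /* exp_name shows up in /sys/kernel/debug/dma_buf/bufinfo */
        return dma_buf_export_named(my_dev_buf, &my_dmabuf_ops,
                                    size, O_CLOEXEC, "my_driver");
    }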
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index a62798fcc014..59bfaecc4e14 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -681,6 +681,9 @@ static int ptmx_open(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
+ /* We refuse fsnotify events on ptmx, since it's a shared resource */
+ filp->f_mode |= FMODE_NONOTIFY;
+
retval = tty_alloc_file(filp);
if (retval)
return retval;
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 97ebc8c5864e..6464029e4860 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -988,10 +988,10 @@ void start_tty(struct tty_struct *tty)
EXPORT_SYMBOL(start_tty);
+/* We limit tty time update visibility to every 8 seconds or so. */
static void tty_update_time(struct timespec *time)
{
- unsigned long sec = get_seconds();
- sec -= sec % 60;
+ unsigned long sec = get_seconds() & ~7;
if ((long)(sec - time->tv_sec) > 0)
time->tv_sec = sec;
}
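
A minimal userspace sketch of the rounding change above (the seconds value is hypothetical): masking off the low three bits rounds down to an 8-second granularity, where the old code rounded down to the minute.

    #include <stdio.h>

    int main(void)
    {
        unsigned long sec = 1366129737UL;          /* pretend get_seconds() */

        unsigned long per_minute = sec - sec % 60; /* old behaviour */
        unsigned long per_8s     = sec & ~7UL;     /* new behaviour */

        printf("raw=%lu minute=%lu 8s=%lu\n", sec, per_minute, per_8s);
        return 0;
    }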
diff --git a/fs/buffer.c b/fs/buffer.c
index 10ef81e10b20..bc1fe14aaa3e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2987,6 +2987,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
/* Take care of bh's that straddle the end of the device */
guard_bh_eod(rw, bio, bh);
+ if (buffer_meta(bh))
+ rw |= REQ_META;
+ if (buffer_prio(bh))
+ rw |= REQ_PRIO;
+
bio_get(bio);
submit_bio(rw, bio);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 987358740cb9..efea5d5c44ce 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -71,4 +71,5 @@ config EXT4_DEBUG
Enables run-time debugging support for the ext4 filesystem.
If you select Y here, then you will be able to turn on debugging
- with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+ with a command such as:
+ echo 1 > /sys/module/ext4/parameters/mballoc_debug
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 92e68b33fffd..d0f13eada0ed 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -30,6 +30,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
*/
/*
+ * Calculate block group number for a given block number
+ */
+ext4_group_t ext4_get_group_number(struct super_block *sb,
+ ext4_fsblk_t block)
+{
+ ext4_group_t group;
+
+ if (test_opt2(sb, STD_GROUP_SIZE))
+ group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+ block) >>
+ (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+ else
+ ext4_get_group_no_and_offset(sb, block, &group, NULL);
+ return group;
+}
+
+/*
* Calculate the block group number and offset into the block/cluster
* allocation bitmap, given a block number
*/
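
A worked example of the STD_GROUP_SIZE fast path above, as a standalone sketch (block and cluster sizes are assumptions): with 4 KiB blocks and no bigalloc, each group holds blocksize * 8 = 2^15 blocks, so the group number reduces to a pure shift.

    #include <stdio.h>

    int main(void)
    {
        unsigned int block_bits = 12;    /* 4 KiB blocks (assumption) */
        unsigned int cluster_bits = 0;   /* no bigalloc (assumption) */
        unsigned long long first_data_block = 0, block = 100000;

        unsigned int shift = block_bits + cluster_bits + 3;
        unsigned long long group = (first_data_block + block) >> shift;

        printf("group = %llu\n", group); /* 100000 >> 15 == 3 */
        return 0;
    }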
@@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
}
-static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
- ext4_group_t block_group)
+/*
+ * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
+ * and 0 otherwise.
+ */
+static inline int ext4_block_in_group(struct super_block *sb,
+ ext4_fsblk_t block,
+ ext4_group_t block_group)
{
ext4_group_t actual_group;
- ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
- if (actual_group == block_group)
- return 1;
- return 0;
+
+ actual_group = ext4_get_group_number(sb, block);
+ return (actual_group == block_group) ? 1 : 0;
}
/* Return the number of clusters used for file system metadata; this
@@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
trace_ext4_read_block_bitmap_load(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
- submit_bh(READ, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
return bh;
verify:
ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
s64 nclusters, unsigned int flags)
{
- s64 free_clusters, dirty_clusters, root_clusters;
+ s64 free_clusters, dirty_clusters, rsv, resv_clusters;
struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
free_clusters = percpu_counter_read_positive(fcc);
dirty_clusters = percpu_counter_read_positive(dcc);
+ resv_clusters = atomic64_read(&sbi->s_resv_clusters);
/*
* r_blocks_count should always be multiple of the cluster ratio so
 * we are safe to do a plain bit shift only.
*/
- root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+ rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+ resv_clusters;
- if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+ if (free_clusters - (nclusters + rsv + dirty_clusters) <
EXT4_FREECLUSTERS_WATERMARK) {
free_clusters = percpu_counter_sum_positive(fcc);
dirty_clusters = percpu_counter_sum_positive(dcc);
@@ -499,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
/* Check whether we have space after accounting for current
* dirty clusters & root reserved clusters.
*/
- if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
+ if (free_clusters >= (rsv + nclusters + dirty_clusters))
return 1;
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
capable(CAP_SYS_RESOURCE) ||
- (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+ (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+ if (free_clusters >= (nclusters + dirty_clusters +
+ resv_clusters))
+ return 1;
+ }
+ /* No free blocks. Let's see if we can dip into reserved pool */
+ if (flags & EXT4_MB_USE_RESERVED) {
if (free_clusters >= (nclusters + dirty_clusters))
return 1;
}
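
A hedged userspace sketch of the three admission tiers above (counter values hypothetical): ordinary allocations must clear both the root reservation and the reserved pool, privileged callers may consume the root reservation, and only EXT4_MB_USE_RESERVED may drain the pool.

    #include <stdbool.h>
    #include <stdio.h>

    static bool has_free_clusters(long long free, long long dirty,
                                  long long root_rsv, long long resv_pool,
                                  long long want, bool privileged,
                                  bool use_reserved)
    {
        if (free >= root_rsv + resv_pool + want + dirty)
            return true;                        /* normal case */
        if (privileged && free >= want + dirty + resv_pool)
            return true;                        /* dip into root blocks */
        if (use_reserved && free >= want + dirty)
            return true;                        /* dip into reserved pool */
        return false;
    }

    int main(void)
    {
        printf("%d\n", has_free_clusters(100, 10, 50, 20, 30, false, false)); /* 0 */
        printf("%d\n", has_free_clusters(100, 10, 50, 20, 30, false, true));  /* 1 */
        return 0;
    }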
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d8cd1f0f4661..f8d56e4254e0 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -46,7 +46,8 @@ static int is_dx_dir(struct inode *inode)
if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX) &&
((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
- ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+ ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
+ ext4_has_inline_data(inode)))
return 1;
return 0;
@@ -115,14 +116,6 @@ static int ext4_readdir(struct file *filp,
int ret = 0;
int dir_has_error = 0;
- if (ext4_has_inline_data(inode)) {
- int has_inline_data = 1;
- ret = ext4_read_inline_dir(filp, dirent, filldir,
- &has_inline_data);
- if (has_inline_data)
- return ret;
- }
-
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) {
@@ -136,6 +129,15 @@ static int ext4_readdir(struct file *filp,
ext4_clear_inode_flag(file_inode(filp),
EXT4_INODE_INDEX);
}
+
+ if (ext4_has_inline_data(inode)) {
+ int has_inline_data = 1;
+ ret = ext4_read_inline_dir(filp, dirent, filldir,
+ &has_inline_data);
+ if (has_inline_data)
+ return ret;
+ }
+
stored = 0;
offset = filp->f_pos & (sb->s_blocksize - 1);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3b83cd604796..0aabb344b02e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t;
#define EXT4_MB_STREAM_ALLOC 0x0800
/* Use reserved root blocks if needed */
#define EXT4_MB_USE_ROOT_BLOCKS 0x1000
+/* Use blocks from reserved pool */
+#define EXT4_MB_USE_RESERVED 0x2000
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -196,19 +198,8 @@ struct mpage_da_data {
#define EXT4_IO_END_ERROR 0x0002
#define EXT4_IO_END_DIRECT 0x0004
-struct ext4_io_page {
- struct page *p_page;
- atomic_t p_count;
-};
-
-#define MAX_IO_PAGES 128
-
/*
* For converting uninitialized extents on a work queue.
- *
- * 'page' is only used from the writepage() path; 'pages' is only used for
- * buffered writes; they are used to keep page references until conversion
- * takes place. For AIO/DIO, neither field is filled in.
*/
typedef struct ext4_io_end {
struct list_head list; /* per-file finished IO list */
@@ -218,15 +209,13 @@ typedef struct ext4_io_end {
ssize_t size; /* size of the extent */
struct kiocb *iocb; /* iocb struct for AIO */
int result; /* error value for AIO */
- int num_io_pages; /* for writepages() */
- struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */
+ atomic_t count; /* reference counter */
} ext4_io_end_t;
struct ext4_io_submit {
int io_op;
struct bio *io_bio;
ext4_io_end_t *io_end;
- struct ext4_io_page *io_page;
sector_t io_next_block;
};
@@ -403,7 +392,7 @@ struct flex_groups {
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -557,9 +546,8 @@ enum {
#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
EXT4_GET_BLOCKS_CREATE)
- /* Caller is from the delayed allocation writeout path,
- so set the magic i_delalloc_reserve_flag after taking the
- inode allocation semaphore for */
+	/* Caller is from the delayed allocation writeout path,
+	 * finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
 /* caller is from the direct IO path; request creation of
 uninitialized extents if not allocated, split the uninitialized
@@ -571,8 +559,9 @@ enum {
/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
- /* Punch out blocks of an extent */
-#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020
+	/* Eventual metadata allocation (due to a growing extent tree)
+	 * should not fail, so try to use reserved blocks for that. */
+#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
/* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
 /* Request will not result in inode size update (used for fallocate) */
@@ -616,6 +605,7 @@ enum {
#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -949,7 +939,7 @@ struct ext4_inode_info {
#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
/*
- * Mount flags
+ * Mount flags set via mount options or defaults
*/
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
@@ -981,8 +971,16 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
+/*
+ * Mount flags set automatically (i.e. they cannot be set via mount options),
+ * based on a per-filesystem feature or property, or in special cases such as
+ * distinguishing an explicit mount option definition from the default.
+ */
#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly
specified delalloc */
+#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group
+ size of blocksize * 8
+ blocks */
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
@@ -1179,6 +1177,7 @@ struct ext4_sb_info {
unsigned int s_mount_flags;
unsigned int s_def_mount_opt;
ext4_fsblk_t s_sb_block;
+ atomic64_t s_resv_clusters;
kuid_t s_resuid;
kgid_t s_resgid;
unsigned short s_mount_state;
@@ -1333,6 +1332,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
return ino == EXT4_ROOT_INO ||
ino == EXT4_USR_QUOTA_INO ||
ino == EXT4_GRP_QUOTA_INO ||
+ ino == EXT4_BOOT_LOADER_INO ||
ino == EXT4_JOURNAL_INO ||
ino == EXT4_RESIZE_INO ||
(ino >= EXT4_FIRST_INO(sb) &&
@@ -1374,6 +1374,7 @@ enum {
EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
+ EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1784,9 +1785,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
*/
#define ERR_BAD_DX_DIR -75000
-void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
- ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-
/*
* Timeout and state flag for lazy initialization inode thread.
*/
@@ -1908,6 +1906,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
struct buffer_head *bh);
/* balloc.c */
+extern void ext4_get_group_no_and_offset(struct super_block *sb,
+ ext4_fsblk_t blocknr,
+ ext4_group_t *blockgrpp,
+ ext4_grpblk_t *offsetp);
+extern ext4_group_t ext4_get_group_number(struct super_block *sb,
+ ext4_fsblk_t block);
+
extern void ext4_validate_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
unsigned int block_group,
@@ -2108,8 +2113,9 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
unsigned long nr_segs);
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
-extern void ext4_ind_truncate(struct inode *inode);
-extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern void ext4_ind_truncate(handle_t *, struct inode *inode);
+extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t first, ext4_lblk_t stop);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2117,6 +2123,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
+extern int ext4_ind_migrate(struct inode *inode);
/* namei.c */
extern int ext4_dirent_csum_verify(struct inode *inode,
@@ -2511,6 +2518,11 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
extern int ext4_read_inline_dir(struct file *filp,
void *dirent, filldir_t filldir,
int *has_inline_data);
+extern int htree_inlinedir_to_tree(struct file *dir_file,
+ struct inode *dir, ext4_lblk_t block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash,
+ int *has_inline_data);
extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
@@ -2547,6 +2559,24 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
extern int ext4_handle_dirty_dirent_node(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+ struct ext4_dir_entry_2 *de,
+ umode_t mode) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+ de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
/* symlink.c */
extern const struct inode_operations ext4_symlink_inode_operations;
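
The table-driven mapping above is easy to check in isolation; a minimal userspace sketch (the EXT4_FT_* strings stand in for the real constants): the high four S_IFMT bits of i_mode index the table directly.

    #include <stdio.h>
    #include <sys/stat.h>

    #define S_SHIFT 12

    int main(void)
    {
        static const char *names[16] = {
            [S_IFREG  >> S_SHIFT] = "EXT4_FT_REG_FILE",
            [S_IFDIR  >> S_SHIFT] = "EXT4_FT_DIR",
            [S_IFCHR  >> S_SHIFT] = "EXT4_FT_CHRDEV",
            [S_IFBLK  >> S_SHIFT] = "EXT4_FT_BLKDEV",
            [S_IFIFO  >> S_SHIFT] = "EXT4_FT_FIFO",
            [S_IFSOCK >> S_SHIFT] = "EXT4_FT_SOCK",
            [S_IFLNK  >> S_SHIFT] = "EXT4_FT_SYMLINK",
        };
        mode_t mode = S_IFDIR | 0755;

        printf("%s\n", names[(mode & S_IFMT) >> S_SHIFT]); /* EXT4_FT_DIR */
        return 0;
    }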
@@ -2573,9 +2603,9 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
int chunk);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(struct inode *);
-extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
- loff_t length);
+extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2609,17 +2639,26 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
/* move_extent.c */
+extern void ext4_double_down_write_data_sem(struct inode *first,
+ struct inode *second);
+extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
+ struct inode *donor_inode);
+void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
+void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 start_orig, __u64 start_donor,
__u64 len, __u64 *moved_len);
/* page-io.c */
extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc);
extern void ext4_end_io_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 8643ff5bbeb7..51bc821ade90 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
0xffff);
}
+#define ext4_ext_dirty(handle, inode, path) \
+ __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+ struct inode *inode, struct ext4_ext_path *path);
+
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7058975e3a55..451eb4045330 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -43,6 +43,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
{
journal_t *journal;
+ might_sleep();
+
trace_ext4_journal_start(sb, nblocks, _RET_IP_);
if (sb->s_flags & MS_RDONLY)
return ERR_PTR(-EROFS);
@@ -113,6 +115,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
{
int err = 0;
+ might_sleep();
+
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
if (err)
@@ -209,6 +213,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
{
int err = 0;
+ might_sleep();
+
+ set_buffer_meta(bh);
+ set_buffer_prio(bh);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
if (err) {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 4c216b1bf20c..c8c6885406db 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -29,11 +29,13 @@
* block to complete the transaction.
*
* For extents-enabled fs we may have to allocate and modify up to
- * 5 levels of tree + root which are stored in the inode. */
+ * 5 levels of tree plus a data block (for each of these we need a bitmap
+ * + group summary), the root which is stored in the inode, and the sb
+ */
#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
- ? 27U : 8U)
+ ? 20U : 8U)
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
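
One plausible accounting for the new 20U bound, read straight off the updated comment (a sketch, not the commit's own derivation):

    (5 tree blocks + 1 data block) * (block + bitmap + group summary)
        + root in inode + superblock
        = 6 * 3 + 1 + 1
        = 20 credits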
@@ -194,16 +196,20 @@ static inline void ext4_journal_callback_add(handle_t *handle,
* ext4_journal_callback_del: delete a registered callback
* @handle: active journal transaction handle on which callback was registered
* @jce: registered journal callback entry to unregister
+ * Return true if the object was successfully removed
*/
-static inline void ext4_journal_callback_del(handle_t *handle,
+static inline bool ext4_journal_callback_try_del(handle_t *handle,
struct ext4_journal_cb_entry *jce)
{
+ bool deleted;
struct ext4_sb_info *sbi =
EXT4_SB(handle->h_transaction->t_journal->j_private);
spin_lock(&sbi->s_md_lock);
+ deleted = !list_empty(&jce->jce_list);
list_del_init(&jce->jce_list);
spin_unlock(&sbi->s_md_lock);
+ return deleted;
}
int
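
The try-del pattern above (sample list_empty() under the lock, then list_del_init()) is worth seeing in isolation; a hedged userspace sketch with a hand-rolled list standing in for the kernel's:

    #include <stdbool.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
    static bool list_empty(const struct list_head *h) { return h->next == h; }

    static void list_del_init(struct list_head *e)
    {
        e->prev->next = e->next;
        e->next->prev = e->prev;
        INIT_LIST_HEAD(e);              /* idempotent: safe to del again */
    }

    static bool try_del(struct list_head *e)
    {
        bool deleted = !list_empty(e);  /* still queued? */
        list_del_init(e);
        return deleted;
    }

    int main(void)
    {
        struct list_head head, node;
        INIT_LIST_HEAD(&head);
        INIT_LIST_HEAD(&node);

        node.next = head.next; node.prev = &head;    /* queue node */
        head.next->prev = &node; head.next = &node;

        int first = try_del(&node);   /* 1: was queued */
        int second = try_del(&node);  /* 0: already gone */
        printf("%d %d\n", first, second);
        return 0;
    }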
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9c6d06dcef8b..107936db244e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -157,11 +157,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
* - ENOMEM
* - EIO
*/
-#define ext4_ext_dirty(handle, inode, path) \
- __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
-static int __ext4_ext_dirty(const char *where, unsigned int line,
- handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+ struct inode *inode, struct ext4_ext_path *path)
{
int err;
if (path->p_bh) {
@@ -1813,39 +1810,101 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
}
depth = ext_depth(inode);
ex = path[depth].p_ext;
+ eh = path[depth].p_hdr;
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
return -EIO;
}
/* try to insert block into found extent and return */
- if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
- && ext4_can_extents_be_merged(inode, ex, newext)) {
- ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
- ext4_ext_is_uninitialized(newext),
- ext4_ext_get_actual_len(newext),
- le32_to_cpu(ex->ee_block),
- ext4_ext_is_uninitialized(ex),
- ext4_ext_get_actual_len(ex),
- ext4_ext_pblock(ex));
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- return err;
+ if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
/*
- * ext4_can_extents_be_merged should have checked that either
- * both extents are uninitialized, or both aren't. Thus we
- * need to check only one of them here.
+ * Try to see whether we should rather test the extent to the
+ * right of ex, or the one to its left. This is because
+ * ext4_ext_find_extent() can return either the extent on the
+ * left or the one on the right of the searched position. This
+ * will make merging more effective.
*/
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
- ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ if (ex < EXT_LAST_EXTENT(eh) &&
+ (le32_to_cpu(ex->ee_block) +
+ ext4_ext_get_actual_len(ex) <
+ le32_to_cpu(newext->ee_block))) {
+ ex += 1;
+ goto prepend;
+ } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
+ (le32_to_cpu(newext->ee_block) +
+ ext4_ext_get_actual_len(newext) <
+ le32_to_cpu(ex->ee_block)))
+ ex -= 1;
+
+ /* Try to append newex to the ex */
+ if (ext4_can_extents_be_merged(inode, ex, newext)) {
+ ext_debug("append [%d]%d block to %u:[%d]%d"
+ "(from %llu)\n",
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext),
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
+ ext4_ext_get_actual_len(ex),
+ ext4_ext_pblock(ex));
+ err = ext4_ext_get_access(handle, inode,
+ path + depth);
+ if (err)
+ return err;
+
+ /*
+ * ext4_can_extents_be_merged should have checked
+ * that either both extents are uninitialized, or
+ * both aren't. Thus we need to check only one of
+ * them here.
+ */
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
- if (uninitialized)
- ext4_ext_mark_uninitialized(ex);
- eh = path[depth].p_hdr;
- nearex = ex;
- goto merge;
+ if (uninitialized)
+ ext4_ext_mark_uninitialized(ex);
+ eh = path[depth].p_hdr;
+ nearex = ex;
+ goto merge;
+ }
+
+prepend:
+ /* Try to prepend newex to the ex */
+ if (ext4_can_extents_be_merged(inode, newext, ex)) {
+ ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+ "(from %llu)\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext),
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
+ ext4_ext_get_actual_len(ex),
+ ext4_ext_pblock(ex));
+ err = ext4_ext_get_access(handle, inode,
+ path + depth);
+ if (err)
+ return err;
+
+ /*
+ * ext4_can_extents_be_merged should have checked
+ * that either both extents are uninitialized, or
+ * both aren't. Thus we need to check only one of
+ * them here.
+ */
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex->ee_block = newext->ee_block;
+ ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ + ext4_ext_get_actual_len(newext));
+ if (uninitialized)
+ ext4_ext_mark_uninitialized(ex);
+ eh = path[depth].p_hdr;
+ nearex = ex;
+ goto merge;
+ }
}
depth = ext_depth(inode);
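
The abutment test driving the new append/prepend merging is simple enough to state standalone; a hedged sketch (the real check additionally bounds the merged length and compares the uninitialized flags):

    #include <stdbool.h>
    #include <stdio.h>

    struct extent { unsigned int lblk, len; unsigned long long pblk; };

    /* next can be appended to ex iff both address spaces abut */
    static bool can_append(const struct extent *ex, const struct extent *next)
    {
        return ex->lblk + ex->len == next->lblk &&  /* logically abutting */
               ex->pblk + ex->len == next->pblk;    /* physically abutting */
    }

    int main(void)
    {
        struct extent ex     = { .lblk = 100, .len = 8, .pblk = 5000 };
        struct extent newext = { .lblk = 108, .len = 4, .pblk = 5008 };

        printf("mergeable=%d\n", can_append(&ex, &newext)); /* 1 */
        return 0;
    }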
@@ -1880,8 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
* There is no free space in the found leaf.
* We're gonna add a new leaf in the tree.
*/
- if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
- flags = EXT4_MB_USE_ROOT_BLOCKS;
+ if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+ flags = EXT4_MB_USE_RESERVED;
err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
if (err)
goto cleanup;
@@ -2599,8 +2658,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
return 1;
}
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
- ext4_lblk_t end)
+int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
{
struct super_block *sb = inode->i_sb;
int depth = ext_depth(inode);
@@ -2667,12 +2726,14 @@ again:
/*
* Split the extent in two so that 'end' is the last
- * block in the first new extent
+ * block in the first new extent. Also, we should not
+ * fail to remove space due to ENOSPC, so try to use
+ * reserved blocks if that happens.
*/
err = ext4_split_extent_at(handle, inode, path,
- end + 1, split_flag,
- EXT4_GET_BLOCKS_PRE_IO |
- EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+ end + 1, split_flag,
+ EXT4_GET_BLOCKS_PRE_IO |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL);
if (err < 0)
goto out;
@@ -3147,35 +3208,35 @@ out:
static int ext4_ext_convert_to_initialized(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path,
+ int flags)
{
struct ext4_sb_info *sbi;
struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
struct ext4_extent zero_ex;
- struct ext4_extent *ex;
+ struct ext4_extent *ex, *abut_ex;
ext4_lblk_t ee_block, eof_block;
- unsigned int ee_len, depth;
- int allocated, max_zeroout = 0;
+ unsigned int ee_len, depth, map_len = map->m_len;
+ int allocated = 0, max_zeroout = 0;
int err = 0;
int split_flag = 0;
ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)map->m_lblk, map->m_len);
+ (unsigned long long)map->m_lblk, map_len);
sbi = EXT4_SB(inode->i_sb);
eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
inode->i_sb->s_blocksize_bits;
- if (eof_block < map->m_lblk + map->m_len)
- eof_block = map->m_lblk + map->m_len;
+ if (eof_block < map->m_lblk + map_len)
+ eof_block = map->m_lblk + map_len;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- allocated = ee_len - (map->m_lblk - ee_block);
zero_ex.ee_len = 0;
trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
@@ -3186,77 +3247,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
/*
* Attempt to transfer newly initialized blocks from the currently
- * uninitialized extent to its left neighbor. This is much cheaper
+ * uninitialized extent to its neighbor. This is much cheaper
* than an insertion followed by a merge as those involve costly
- * memmove() calls. This is the common case in steady state for
- * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
- * writes.
+ * memmove() calls. Transferring to the left is the common case in
+ * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
+ * followed by append writes.
*
* Limitations of the current logic:
- * - L1: we only deal with writes at the start of the extent.
- * The approach could be extended to writes at the end
- * of the extent but this scenario was deemed less common.
- * - L2: we do not deal with writes covering the whole extent.
+ * - L1: we do not deal with writes covering the whole extent.
* This would require removing the extent if the transfer
* is possible.
- * - L3: we only attempt to merge with an extent stored in the
+ * - L2: we only attempt to merge with an extent stored in the
* same extent tree node.
*/
- if ((map->m_lblk == ee_block) && /*L1*/
- (map->m_len < ee_len) && /*L2*/
- (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/
- struct ext4_extent *prev_ex;
+ if ((map->m_lblk == ee_block) &&
+ /* See if we can merge left */
+ (map_len < ee_len) && /*L1*/
+ (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/
ext4_lblk_t prev_lblk;
ext4_fsblk_t prev_pblk, ee_pblk;
- unsigned int prev_len, write_len;
+ unsigned int prev_len;
- prev_ex = ex - 1;
- prev_lblk = le32_to_cpu(prev_ex->ee_block);
- prev_len = ext4_ext_get_actual_len(prev_ex);
- prev_pblk = ext4_ext_pblock(prev_ex);
+ abut_ex = ex - 1;
+ prev_lblk = le32_to_cpu(abut_ex->ee_block);
+ prev_len = ext4_ext_get_actual_len(abut_ex);
+ prev_pblk = ext4_ext_pblock(abut_ex);
ee_pblk = ext4_ext_pblock(ex);
- write_len = map->m_len;
/*
- * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+ * A transfer of blocks from 'ex' to 'abut_ex' is allowed
* upon those conditions:
- * - C1: prev_ex is initialized,
- * - C2: prev_ex is logically abutting ex,
- * - C3: prev_ex is physically abutting ex,
- * - C4: prev_ex can receive the additional blocks without
+ * - C1: abut_ex is initialized,
+ * - C2: abut_ex is logically abutting ex,
+ * - C3: abut_ex is physically abutting ex,
+ * - C4: abut_ex can receive the additional blocks without
* overflowing the (initialized) length limit.
*/
- if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/
+ if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/
((prev_lblk + prev_len) == ee_block) && /*C2*/
((prev_pblk + prev_len) == ee_pblk) && /*C3*/
- (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/
+ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
trace_ext4_ext_convert_to_initialized_fastpath(inode,
- map, ex, prev_ex);
+ map, ex, abut_ex);
- /* Shift the start of ex by 'write_len' blocks */
- ex->ee_block = cpu_to_le32(ee_block + write_len);
- ext4_ext_store_pblock(ex, ee_pblk + write_len);
- ex->ee_len = cpu_to_le16(ee_len - write_len);
+ /* Shift the start of ex by 'map_len' blocks */
+ ex->ee_block = cpu_to_le32(ee_block + map_len);
+ ext4_ext_store_pblock(ex, ee_pblk + map_len);
+ ex->ee_len = cpu_to_le16(ee_len - map_len);
ext4_ext_mark_uninitialized(ex); /* Restore the flag */
- /* Extend prev_ex by 'write_len' blocks */
- prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+ /* Extend abut_ex by 'map_len' blocks */
+ abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
- /* Mark the block containing both extents as dirty */
- ext4_ext_dirty(handle, inode, path + depth);
+ /* Result: number of initialized blocks past m_lblk */
+ allocated = map_len;
+ }
+ } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
+ (map_len < ee_len) && /*L1*/
+ ex < EXT_LAST_EXTENT(eh)) { /*L2*/
+ /* See if we can merge right */
+ ext4_lblk_t next_lblk;
+ ext4_fsblk_t next_pblk, ee_pblk;
+ unsigned int next_len;
+
+ abut_ex = ex + 1;
+ next_lblk = le32_to_cpu(abut_ex->ee_block);
+ next_len = ext4_ext_get_actual_len(abut_ex);
+ next_pblk = ext4_ext_pblock(abut_ex);
+ ee_pblk = ext4_ext_pblock(ex);
- /* Update path to point to the right extent */
- path[depth].p_ext = prev_ex;
+ /*
+ * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+ * upon those conditions:
+ * - C1: abut_ex is initialized,
+ * - C2: abut_ex is logically abutting ex,
+ * - C3: abut_ex is physically abutting ex,
+ * - C4: abut_ex can receive the additional blocks without
+ * overflowing the (initialized) length limit.
+ */
+ if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/
+ ((map->m_lblk + map_len) == next_lblk) && /*C2*/
+ ((ee_pblk + ee_len) == next_pblk) && /*C3*/
+ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ trace_ext4_ext_convert_to_initialized_fastpath(inode,
+ map, ex, abut_ex);
+
+ /* Shift the start of abut_ex by 'map_len' blocks */
+ abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
+ ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
+ ex->ee_len = cpu_to_le16(ee_len - map_len);
+ ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+ /* Extend abut_ex by 'map_len' blocks */
+ abut_ex->ee_len = cpu_to_le16(next_len + map_len);
/* Result: number of initialized blocks past m_lblk */
- allocated = write_len;
- goto out;
+ allocated = map_len;
}
}
+ if (allocated) {
+ /* Mark the block containing both extents as dirty */
+ ext4_ext_dirty(handle, inode, path + depth);
+
+ /* Update path to point to the right extent */
+ path[depth].p_ext = abut_ex;
+ goto out;
+ } else
+ allocated = ee_len - (map->m_lblk - ee_block);
WARN_ON(map->m_lblk < ee_block);
/*
@@ -3330,7 +3435,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
}
allocated = ext4_split_extent(handle, inode, path,
- &split_map, split_flag, 0);
+ &split_map, split_flag, flags);
if (allocated < 0)
err = allocated;
@@ -3650,6 +3755,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
flags, allocated);
ext4_ext_show_leaf(inode, path);
+ /*
+ * When writing into uninitialized space, we should not fail to
+ * allocate metadata blocks for the new extent block if needed.
+ */
+ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
+
trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
allocated, newblock);
@@ -3713,7 +3824,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
}
/* buffered write, writepage time, convert*/
- ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
+ ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
out:
@@ -4257,48 +4368,13 @@ out3:
return err ? err : allocated;
}
-void ext4_ext_truncate(struct inode *inode)
+void ext4_ext_truncate(handle_t *handle, struct inode *inode)
{
- struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
ext4_lblk_t last_block;
- handle_t *handle;
- loff_t page_len;
int err = 0;
/*
- * finish any pending end_io work so we won't run the risk of
- * converting any truncated blocks to initialized later
- */
- ext4_flush_unwritten_io(inode);
-
- /*
- * probably first extent we're gonna free will be last in block
- */
- err = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err);
- if (IS_ERR(handle))
- return;
-
- if (inode->i_size % PAGE_CACHE_SIZE != 0) {
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
- err = ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0);
-
- if (err)
- goto out_stop;
- }
-
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
-
- down_write(&EXT4_I(inode)->i_data_sem);
-
- ext4_discard_preallocations(inode);
-
- /*
* TODO: optimization is possible here.
* Probably we need not scan at all,
* because page truncation is enough.
@@ -4313,29 +4389,6 @@ void ext4_ext_truncate(struct inode *inode)
err = ext4_es_remove_extent(inode, last_block,
EXT_MAX_BLOCKS - last_block);
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
-
- /* In a multi-transaction truncate, we only make the final
- * transaction synchronous.
- */
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
-
- up_write(&EXT4_I(inode)->i_data_sem);
-
-out_stop:
- /*
- * If this was a simple ftruncate() and the file will remain alive,
- * then we need to clear up the orphan record which we created above.
- * However, if this was a real unlink then we were called by
- * ext4_delete_inode(), and we allow that function to clean up the
- * orphan info for us.
- */
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
-
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_stop(handle);
}
static void ext4_falloc_update_inode(struct inode *inode,
@@ -4623,187 +4676,6 @@ static int ext4_xattr_fiemap(struct inode *inode,
return (error < 0 ? error : 0);
}
-/*
- * ext4_ext_punch_hole
- *
- * Punches a hole of "length" bytes in a file starting
- * at byte "offset"
- *
- * @inode: The inode of the file to punch a hole in
- * @offset: The starting byte offset of the hole
- * @length: The length of the hole
- *
- * Returns the number of blocks removed or negative on err
- */
-int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
-{
- struct inode *inode = file_inode(file);
- struct super_block *sb = inode->i_sb;
- ext4_lblk_t first_block, stop_block;
- struct address_space *mapping = inode->i_mapping;
- handle_t *handle;
- loff_t first_page, last_page, page_len;
- loff_t first_page_offset, last_page_offset;
- int credits, err = 0;
-
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- err = filemap_write_and_wait_range(mapping,
- offset, offset + length - 1);
-
- if (err)
- return err;
- }
-
- mutex_lock(&inode->i_mutex);
- /* It's not possible punch hole on append only file */
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
- err = -EPERM;
- goto out_mutex;
- }
- if (IS_SWAPFILE(inode)) {
- err = -ETXTBSY;
- goto out_mutex;
- }
-
- /* No need to punch hole beyond i_size */
- if (offset >= inode->i_size)
- goto out_mutex;
-
- /*
- * If the hole extends beyond i_size, set the hole
- * to end after the page that contains i_size
- */
- if (offset + length > inode->i_size) {
- length = inode->i_size +
- PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
- offset;
- }
-
- first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- last_page = (offset + length) >> PAGE_CACHE_SHIFT;
-
- first_page_offset = first_page << PAGE_CACHE_SHIFT;
- last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
- /* Now release the pages */
- if (last_page_offset > first_page_offset) {
- truncate_pagecache_range(inode, first_page_offset,
- last_page_offset - 1);
- }
-
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- err = ext4_flush_unwritten_io(inode);
- if (err)
- goto out_dio;
- inode_dio_wait(inode);
-
- credits = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto out_dio;
- }
-
-
- /*
- * Now we need to zero out the non-page-aligned data in the
- * pages at the start and tail of the hole, and unmap the buffer
- * heads for the block aligned regions of the page that were
- * completely zeroed.
- */
- if (first_page > last_page) {
- /*
- * If the file space being truncated is contained within a page
- * just zero out and unmap the middle of that page
- */
- err = ext4_discard_partial_page_buffers(handle,
- mapping, offset, length, 0);
-
- if (err)
- goto out;
- } else {
- /*
- * zero out and unmap the partial page that contains
- * the start of the hole
- */
- page_len = first_page_offset - offset;
- if (page_len > 0) {
- err = ext4_discard_partial_page_buffers(handle, mapping,
- offset, page_len, 0);
- if (err)
- goto out;
- }
-
- /*
- * zero out and unmap the partial page that contains
- * the end of the hole
- */
- page_len = offset + length - last_page_offset;
- if (page_len > 0) {
- err = ext4_discard_partial_page_buffers(handle, mapping,
- last_page_offset, page_len, 0);
- if (err)
- goto out;
- }
- }
-
- /*
- * If i_size is contained in the last page, we need to
- * unmap and zero the partial page after i_size
- */
- if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
- inode->i_size % PAGE_CACHE_SIZE != 0) {
-
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
- if (page_len > 0) {
- err = ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0);
-
- if (err)
- goto out;
- }
- }
-
- first_block = (offset + sb->s_blocksize - 1) >>
- EXT4_BLOCK_SIZE_BITS(sb);
- stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
-
- /* If there are no blocks to remove, return now */
- if (first_block >= stop_block)
- goto out;
-
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
-
- err = ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
- err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
-
- ext4_discard_preallocations(inode);
-
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
-
- up_write(&EXT4_I(inode)->i_data_sem);
-
-out:
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_stop(handle);
-out_dio:
- ext4_inode_resume_unlocked_dio(inode);
-out_mutex:
- mutex_unlock(&inode->i_mutex);
- return err;
-}
-
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 3278e64e57b6..e0ba8a408def 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
needs_barrier = true;
- jbd2_log_start_commit(journal, commit_tid);
- ret = jbd2_log_wait_commit(journal, commit_tid);
+ ret = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 6c5bb8d993fe..00a818d67b54 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
trace_ext4_load_inode_bitmap(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
- submit_bh(READ, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
put_bh(bh);
@@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ei = EXT4_I(inode);
sbi = EXT4_SB(sb);
+ /*
+ * Initialize owners and quota early so that we don't have to account
+ * for the quota initialization worst case in the standard inode-creating
+ * transaction
+ */
+ if (owner) {
+ inode->i_mode = mode;
+ i_uid_write(inode, owner[0]);
+ i_gid_write(inode, owner[1]);
+ } else if (test_opt(sb, GRPID)) {
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = dir->i_gid;
+ } else
+ inode_init_owner(inode, dir, mode);
+ dquot_initialize(inode);
+
if (!goal)
goal = sbi->s_inode_goal;
@@ -697,7 +714,7 @@ got_group:
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp)
- goto fail;
+ goto out;
/*
* Check free inodes count before loading bitmap.
@@ -711,7 +728,7 @@ got_group:
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
if (!inode_bitmap_bh)
- goto fail;
+ goto out;
repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
@@ -733,13 +750,16 @@ repeat_in_this_group:
handle_type, nblocks);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
- goto fail;
+ ext4_std_error(sb, err);
+ goto out;
}
}
BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
- if (err)
- goto fail;
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
ext4_lock_group(sb, group);
ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
ext4_unlock_group(sb, group);
@@ -755,8 +775,10 @@ repeat_in_this_group:
got:
BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
- if (err)
- goto fail;
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
/* We may have to initialize the block bitmap if it isn't already */
if (ext4_has_group_desc_csum(sb) &&
@@ -768,7 +790,8 @@ got:
err = ext4_journal_get_write_access(handle, block_bitmap_bh);
if (err) {
brelse(block_bitmap_bh);
- goto fail;
+ ext4_std_error(sb, err);
+ goto out;
}
BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
@@ -787,14 +810,18 @@ got:
ext4_unlock_group(sb, group);
brelse(block_bitmap_bh);
- if (err)
- goto fail;
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
}
BUFFER_TRACE(group_desc_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, group_desc_bh);
- if (err)
- goto fail;
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
@@ -840,8 +867,10 @@ got:
BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
- if (err)
- goto fail;
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
@@ -851,16 +880,6 @@ got:
flex_group = ext4_flex_group(sbi, group);
atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
}
- if (owner) {
- inode->i_mode = mode;
- i_uid_write(inode, owner[0]);
- i_gid_write(inode, owner[1]);
- } else if (test_opt(sb, GRPID)) {
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = dir->i_gid;
- } else
- inode_init_owner(inode, dir, mode);
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
@@ -889,7 +908,9 @@ got:
* twice.
*/
err = -EIO;
- goto fail;
+ ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
+ inode->i_ino);
+ goto out;
}
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
@@ -899,7 +920,6 @@ got:
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
__u32 csum;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
@@ -918,7 +938,6 @@ got:
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
- dquot_initialize(inode);
err = dquot_alloc_inode(inode);
if (err)
goto fail_drop;
@@ -952,24 +971,17 @@ got:
ext4_debug("allocating inode %lu\n", inode->i_ino);
trace_ext4_allocate_inode(inode, dir, mode);
- goto really_out;
-fail:
- ext4_std_error(sb, err);
-out:
- iput(inode);
- ret = ERR_PTR(err);
-really_out:
brelse(inode_bitmap_bh);
return ret;
fail_free_drop:
dquot_free_inode(inode);
-
fail_drop:
- dquot_drop(inode);
- inode->i_flags |= S_NOQUOTA;
clear_nlink(inode);
unlock_new_inode(inode);
+out:
+ dquot_drop(inode);
+ inode->i_flags |= S_NOQUOTA;
iput(inode);
brelse(inode_bitmap_bh);
return ERR_PTR(err);
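The error-path rework above replaces the old fail/out/really_out label trio with a single out: label that both drops the quota state and iputs the half-built inode, with each failure site now calling ext4_std_error() itself. The idiom, reduced to a runnable standalone sketch (all names here are stand-ins, not the ext4 functions):

    #include <stdio.h>
    #include <stdlib.h>

    struct resource { int quota_held; };

    static int step(int should_fail) { return should_fail ? -1 : 0; }

    static struct resource *build(int fail_at)
    {
            struct resource *r = calloc(1, sizeof(*r));
            int err;

            if (!r)
                    return NULL;
            r->quota_held = 1;
            err = step(fail_at == 1);
            if (err)
                    goto out;          /* report, funnel to one cleanup */
            err = step(fail_at == 2);
            if (err)
                    goto out;
            return r;                  /* success path skips cleanup */
    out:
            fprintf(stderr, "build failed: %d\n", err);
            r->quota_held = 0;         /* analogous to dquot_drop() */
            free(r);                   /* analogous to iput() */
            return NULL;
    }

    int main(void)
    {
            free(build(0));            /* succeeds */
            build(2);                  /* fails at step 2, cleans up */
            return 0;
    }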
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index a04183127ef0..98be6f697463 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -292,131 +292,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
}
/**
- * ext4_alloc_blocks: multiple allocate blocks needed for a branch
- * @handle: handle for this transaction
- * @inode: inode which needs allocated blocks
- * @iblock: the logical block to start allocated at
- * @goal: preferred physical block of allocation
- * @indirect_blks: the number of blocks need to allocate for indirect
- * blocks
- * @blks: number of desired blocks
- * @new_blocks: on return it will store the new block numbers for
- * the indirect blocks(if needed) and the first direct block,
- * @err: on return it will store the error code
- *
- * This function will return the number of blocks allocated as
- * requested by the passed-in parameters.
- */
-static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, ext4_fsblk_t goal,
- int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
-{
- struct ext4_allocation_request ar;
- int target, i;
- unsigned long count = 0, blk_allocated = 0;
- int index = 0;
- ext4_fsblk_t current_block = 0;
- int ret = 0;
-
- /*
- * Here we try to allocate the requested multiple blocks at once,
- * on a best-effort basis.
- * To build a branch, we should allocate blocks for
- * the indirect blocks(if not allocated yet), and at least
- * the first direct block of this branch. That's the
- * minimum number of blocks need to allocate(required)
- */
- /* first we try to allocate the indirect blocks */
- target = indirect_blks;
- while (target > 0) {
- count = target;
- /* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_meta_blocks(handle, inode, goal,
- 0, &count, err);
- if (*err)
- goto failed_out;
-
- if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
- EXT4_ERROR_INODE(inode,
- "current_block %llu + count %lu > %d!",
- current_block, count,
- EXT4_MAX_BLOCK_FILE_PHYS);
- *err = -EIO;
- goto failed_out;
- }
-
- target -= count;
- /* allocate blocks for indirect blocks */
- while (index < indirect_blks && count) {
- new_blocks[index++] = current_block++;
- count--;
- }
- if (count > 0) {
- /*
- * save the new block number
- * for the first direct block
- */
- new_blocks[index] = current_block;
- WARN(1, KERN_INFO "%s returned more blocks than "
- "requested\n", __func__);
- break;
- }
- }
-
- target = blks - count ;
- blk_allocated = count;
- if (!target)
- goto allocated;
- /* Now allocate data blocks */
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = target;
- ar.logical = iblock;
- if (S_ISREG(inode->i_mode))
- /* enable in-core preallocation only for regular files */
- ar.flags = EXT4_MB_HINT_DATA;
-
- current_block = ext4_mb_new_blocks(handle, &ar, err);
- if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
- EXT4_ERROR_INODE(inode,
- "current_block %llu + ar.len %d > %d!",
- current_block, ar.len,
- EXT4_MAX_BLOCK_FILE_PHYS);
- *err = -EIO;
- goto failed_out;
- }
-
- if (*err && (target == blks)) {
- /*
- * if the allocation failed and we didn't allocate
- * any blocks before
- */
- goto failed_out;
- }
- if (!*err) {
- if (target == blks) {
- /*
- * save the new block number
- * for the first direct block
- */
- new_blocks[index] = current_block;
- }
- blk_allocated += ar.len;
- }
-allocated:
- /* total number of blocks allocated for direct blocks */
- ret = blk_allocated;
- *err = 0;
- return ret;
-failed_out:
- for (i = 0; i < index; i++)
- ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
- return ret;
-}
-
-/**
* ext4_alloc_branch - allocate and set up a chain of blocks.
* @handle: handle for this transaction
* @inode: owner
@@ -448,60 +323,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
int *blks, ext4_fsblk_t goal,
ext4_lblk_t *offsets, Indirect *branch)
{
- int blocksize = inode->i_sb->s_blocksize;
- int i, n = 0;
- int err = 0;
- struct buffer_head *bh;
- int num;
- ext4_fsblk_t new_blocks[4];
- ext4_fsblk_t current_block;
-
- num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
- *blks, new_blocks, &err);
- if (err)
- return err;
+ struct ext4_allocation_request ar;
+	struct buffer_head *bh;
+ ext4_fsblk_t b, new_blocks[4];
+ __le32 *p;
+ int i, j, err, len = 1;
- branch[0].key = cpu_to_le32(new_blocks[0]);
/*
- * metadata blocks and data blocks are allocated.
+ * Set up for the direct block allocation
*/
- for (n = 1; n <= indirect_blks; n++) {
- /*
- * Get buffer_head for parent block, zero it out
- * and set the pointer to new one, then send
- * parent to disk.
- */
- bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+ memset(&ar, 0, sizeof(ar));
+ ar.inode = inode;
+ ar.len = *blks;
+ ar.logical = iblock;
+ if (S_ISREG(inode->i_mode))
+ ar.flags = EXT4_MB_HINT_DATA;
+
+ for (i = 0; i <= indirect_blks; i++) {
+ if (i == indirect_blks) {
+ ar.goal = goal;
+ new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
+ } else
+ goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode,
+ goal, 0, NULL, &err);
+ if (err) {
+ i--;
+ goto failed;
+ }
+ branch[i].key = cpu_to_le32(new_blocks[i]);
+ if (i == 0)
+ continue;
+
+ bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]);
if (unlikely(!bh)) {
err = -ENOMEM;
goto failed;
}
-
- branch[n].bh = bh;
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
err = ext4_journal_get_create_access(handle, bh);
if (err) {
- /* Don't brelse(bh) here; it's done in
- * ext4_journal_forget() below */
unlock_buffer(bh);
goto failed;
}
- memset(bh->b_data, 0, blocksize);
- branch[n].p = (__le32 *) bh->b_data + offsets[n];
- branch[n].key = cpu_to_le32(new_blocks[n]);
- *branch[n].p = branch[n].key;
- if (n == indirect_blks) {
- current_block = new_blocks[n];
- /*
- * End of chain, update the last new metablock of
- * the chain to point to the new allocated
- * data blocks numbers
- */
- for (i = 1; i < num; i++)
- *(branch[n].p + i) = cpu_to_le32(++current_block);
- }
+ memset(bh->b_data, 0, bh->b_size);
+ p = branch[i].p = (__le32 *) bh->b_data + offsets[i];
+ b = new_blocks[i];
+
+ if (i == indirect_blks)
+ len = ar.len;
+ for (j = 0; j < len; j++)
+ *p++ = cpu_to_le32(b++);
+
BUFFER_TRACE(bh, "marking uptodate");
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -511,25 +385,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
if (err)
goto failed;
}
- *blks = num;
- return err;
+ *blks = ar.len;
+ return 0;
failed:
- /* Allocation failed, free what we already allocated */
- ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
- for (i = 1; i <= n ; i++) {
- /*
- * branch[i].bh is newly allocated, so there is no
- * need to revoke the block, which is why we don't
- * need to set EXT4_FREE_BLOCKS_METADATA.
- */
- ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
- EXT4_FREE_BLOCKS_FORGET);
+ for (; i >= 0; i--) {
+ if (i != indirect_blks && branch[i].bh)
+ ext4_forget(handle, 1, inode, branch[i].bh,
+ branch[i].bh->b_blocknr);
+ ext4_free_blocks(handle, inode, NULL, new_blocks[i],
+ (i == indirect_blks) ? ar.len : 1, 0);
}
- for (i = n+1; i < indirect_blks; i++)
- ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
-
- ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
-
return err;
}
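The rewritten ext4_alloc_branch() collapses the old two-phase scheme (ext4_alloc_blocks() plus a wiring loop) into one loop that allocates each indirect block in turn and the data blocks last; on failure it steps i back down and frees exactly what was allocated. The rollback shape, as a runnable userspace sketch with malloc standing in for block allocation:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            void *blocks[4] = { NULL };
            int i, n = 3;                     /* allocate entries 0..n */

            for (i = 0; i <= n; i++) {
                    blocks[i] = malloc(16);   /* one unit per step */
                    if (!blocks[i]) {
                            i--;              /* nothing to free at i */
                            goto failed;
                    }
            }
            puts("all allocated");
            /* fall through and free everything for the demo */
    failed:
            for (; i >= 0; i--)
                    free(blocks[i]);          /* unwind in reverse */
            return 0;
    }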
@@ -941,26 +806,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 * be able to restart the transaction at a convenient checkpoint to make
* sure we don't overflow the journal.
*
- * start_transaction gets us a new handle for a truncate transaction,
- * and extend_transaction tries to extend the existing one a bit. If
+ * Try to extend this transaction for the purposes of truncation. If
* extend fails, we need to propagate the failure up and restart the
* transaction in the top-level truncate loop. --sct
- */
-static handle_t *start_transaction(struct inode *inode)
-{
- handle_t *result;
-
- result = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
- ext4_blocks_for_truncate(inode));
- if (!IS_ERR(result))
- return result;
-
- ext4_std_error(inode->i_sb, PTR_ERR(result));
- return result;
-}
-
-/*
- * Try to extend this transaction for the purposes of truncation.
*
* Returns 0 if we managed to create more room. If we can't create more
* room, and the transaction must be restarted we return 1.
@@ -1353,68 +1201,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
}
-void ext4_ind_truncate(struct inode *inode)
+void ext4_ind_truncate(handle_t *handle, struct inode *inode)
{
- handle_t *handle;
struct ext4_inode_info *ei = EXT4_I(inode);
__le32 *i_data = ei->i_data;
int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- struct address_space *mapping = inode->i_mapping;
ext4_lblk_t offsets[4];
Indirect chain[4];
Indirect *partial;
__le32 nr = 0;
int n = 0;
ext4_lblk_t last_block, max_block;
- loff_t page_len;
unsigned blocksize = inode->i_sb->s_blocksize;
- int err;
-
- handle = start_transaction(inode);
- if (IS_ERR(handle))
- return; /* AKPM: return what? */
last_block = (inode->i_size + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
- if (inode->i_size % PAGE_CACHE_SIZE != 0) {
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
- err = ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0);
-
- if (err)
- goto out_stop;
- }
-
if (last_block != max_block) {
n = ext4_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
- goto out_stop; /* error */
+ return;
}
- /*
- * OK. This truncate is going to happen. We add the inode to the
- * orphan list, so that if this truncate spans multiple transactions,
- * and we crash, we will resume the truncate when the filesystem
- * recovers. It also marks the inode dirty, to catch the new size.
- *
- * Implication: the file must always be in a sane, consistent
- * truncatable state while each transaction commits.
- */
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
-
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
-
- ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
/*
@@ -1431,7 +1241,7 @@ void ext4_ind_truncate(struct inode *inode)
* It is unnecessary to free any data blocks if last_block is
* equal to the indirect block limit.
*/
- goto out_unlock;
+ return;
} else if (n == 1) { /* direct blocks */
ext4_free_data(handle, inode, NULL, i_data+offsets[0],
i_data + EXT4_NDIR_BLOCKS);
@@ -1491,31 +1301,6 @@ do_indirects:
case EXT4_TIND_BLOCK:
;
}
-
-out_unlock:
- up_write(&ei->i_data_sem);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
-
- /*
- * In a multi-transaction truncate, we only make the final transaction
- * synchronous
- */
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
-out_stop:
- /*
- * If this was a simple ftruncate(), and the file will remain alive
- * then we need to clear up the orphan record which we created above.
- * However, if this was a real unlink then we were called by
- * ext4_delete_inode(), and we allow that function to clean up the
- * orphan info for us.
- */
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
-
- ext4_journal_stop(handle);
- trace_ext4_truncate_exit(inode);
}
static int free_hole_blocks(handle_t *handle, struct inode *inode,
@@ -1569,8 +1354,8 @@ err:
return ret;
}
-static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t first, ext4_lblk_t stop)
+int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t first, ext4_lblk_t stop)
{
int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
int level, ret = 0;
@@ -1604,157 +1389,3 @@ err:
return ret;
}
-int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length)
-{
- struct inode *inode = file_inode(file);
- struct super_block *sb = inode->i_sb;
- ext4_lblk_t first_block, stop_block;
- struct address_space *mapping = inode->i_mapping;
- handle_t *handle = NULL;
- loff_t first_page, last_page, page_len;
- loff_t first_page_offset, last_page_offset;
- int err = 0;
-
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- err = filemap_write_and_wait_range(mapping,
- offset, offset + length - 1);
- if (err)
- return err;
- }
-
- mutex_lock(&inode->i_mutex);
- /* It's not possible punch hole on append only file */
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
- err = -EPERM;
- goto out_mutex;
- }
- if (IS_SWAPFILE(inode)) {
- err = -ETXTBSY;
- goto out_mutex;
- }
-
- /* No need to punch hole beyond i_size */
- if (offset >= inode->i_size)
- goto out_mutex;
-
- /*
- * If the hole extents beyond i_size, set the hole
- * to end after the page that contains i_size
- */
- if (offset + length > inode->i_size) {
- length = inode->i_size +
- PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
- offset;
- }
-
- first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- last_page = (offset + length) >> PAGE_CACHE_SHIFT;
-
- first_page_offset = first_page << PAGE_CACHE_SHIFT;
- last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
- /* Now release the pages */
- if (last_page_offset > first_page_offset) {
- truncate_pagecache_range(inode, first_page_offset,
- last_page_offset - 1);
- }
-
- /* Wait all existing dio works, newcomers will block on i_mutex */
- inode_dio_wait(inode);
-
- handle = start_transaction(inode);
- if (IS_ERR(handle))
- goto out_mutex;
-
- /*
- * Now we need to zero out the non-page-aligned data in the
- * pages at the start and tail of the hole, and unmap the buffer
- * heads for the block aligned regions of the page that were
- * completely zerod.
- */
- if (first_page > last_page) {
- /*
- * If the file space being truncated is contained within a page
- * just zero out and unmap the middle of that page
- */
- err = ext4_discard_partial_page_buffers(handle,
- mapping, offset, length, 0);
- if (err)
- goto out;
- } else {
- /*
- * Zero out and unmap the paritial page that contains
- * the start of the hole
- */
- page_len = first_page_offset - offset;
- if (page_len > 0) {
- err = ext4_discard_partial_page_buffers(handle, mapping,
- offset, page_len, 0);
- if (err)
- goto out;
- }
-
- /*
- * Zero out and unmap the partial page that contains
- * the end of the hole
- */
- page_len = offset + length - last_page_offset;
- if (page_len > 0) {
- err = ext4_discard_partial_page_buffers(handle, mapping,
- last_page_offset, page_len, 0);
- if (err)
- goto out;
- }
- }
-
- /*
- * If i_size contained in the last page, we need to
- * unmap and zero the paritial page after i_size
- */
- if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
- inode->i_size % PAGE_CACHE_SIZE != 0) {
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
- if (page_len > 0) {
- err = ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0);
- if (err)
- goto out;
- }
- }
-
- first_block = (offset + sb->s_blocksize - 1) >>
- EXT4_BLOCK_SIZE_BITS(sb);
- stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
-
- if (first_block >= stop_block)
- goto out;
-
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
-
- err = ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
- err = ext4_free_hole_blocks(handle, inode, first_block, stop_block);
-
- ext4_discard_preallocations(inode);
-
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
-
- up_write(&EXT4_I(inode)->i_data_sem);
-
-out:
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_stop(handle);
-
-out_mutex:
- mutex_unlock(&inode->i_mutex);
-
- return err;
-}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index c0fd1a123f7d..3e2bf873e8a8 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -19,7 +19,8 @@
#define EXT4_XATTR_SYSTEM_DATA "data"
#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
-#define EXT4_INLINE_DOTDOT_SIZE 4
+#define EXT4_INLINE_DOTDOT_OFFSET 2
+#define EXT4_INLINE_DOTDOT_SIZE 4
int ext4_get_inline_size(struct inode *inode)
{
@@ -1289,6 +1290,120 @@ out:
return ret;
}
+/*
+ * This function fills a red-black tree with information from an
+ * inlined dir. It returns the number of directory entries loaded
+ * into the tree. If there is an error, it is returned in err.
+ */
+int htree_inlinedir_to_tree(struct file *dir_file,
+ struct inode *dir, ext4_lblk_t block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash,
+ int *has_inline_data)
+{
+ int err = 0, count = 0;
+ unsigned int parent_ino;
+ int pos;
+ struct ext4_dir_entry_2 *de;
+ struct inode *inode = file_inode(dir_file);
+ int ret, inline_size = 0;
+ struct ext4_iloc iloc;
+ void *dir_buf = NULL;
+ struct ext4_dir_entry_2 fake;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+ dir_buf = kmalloc(inline_size, GFP_NOFS);
+ if (!dir_buf) {
+ ret = -ENOMEM;
+ up_read(&EXT4_I(inode)->xattr_sem);
+ goto out;
+ }
+
+ ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (ret < 0)
+ goto out;
+
+ pos = 0;
+ parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+ while (pos < inline_size) {
+ /*
+ * As inlined dir doesn't store any information about '.' and
+ * only the inode number of '..' is stored, we have to handle
+ * them differently.
+ */
+ if (pos == 0) {
+ fake.inode = cpu_to_le32(inode->i_ino);
+ fake.name_len = 1;
+ strcpy(fake.name, ".");
+ fake.rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(fake.name_len),
+ inline_size);
+ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+ de = &fake;
+ pos = EXT4_INLINE_DOTDOT_OFFSET;
+ } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
+ fake.inode = cpu_to_le32(parent_ino);
+ fake.name_len = 2;
+ strcpy(fake.name, "..");
+ fake.rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(fake.name_len),
+ inline_size);
+ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+ de = &fake;
+ pos = EXT4_INLINE_DOTDOT_SIZE;
+ } else {
+ de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
+ pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
+ if (ext4_check_dir_entry(inode, dir_file, de,
+ iloc.bh, dir_buf,
+ inline_size, pos)) {
+ ret = count;
+ goto out;
+ }
+ }
+
+ ext4fs_dirhash(de->name, de->name_len, hinfo);
+ if ((hinfo->hash < start_hash) ||
+ ((hinfo->hash == start_hash) &&
+ (hinfo->minor_hash < start_minor_hash)))
+ continue;
+ if (de->inode == 0)
+ continue;
+ err = ext4_htree_store_dirent(dir_file,
+ hinfo->hash, hinfo->minor_hash, de);
+ if (err) {
+ count = err;
+ goto out;
+ }
+ count++;
+ }
+ ret = count;
+out:
+ kfree(dir_buf);
+ brelse(iloc.bh);
+ return ret;
+}
+
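Because an inline directory stores no '.' entry at all and only the bare inode number of '..', the loop above fabricates those two dirents and drives pos through three states — 0, EXT4_INLINE_DOTDOT_OFFSET (2) and EXT4_INLINE_DOTDOT_SIZE (4) — after which real entries begin. A runnable toy model of that state machine (constants copied from above; the rec_len of 12 is illustrative):

    #include <stdio.h>

    #define EXT4_INLINE_DOTDOT_OFFSET 2
    #define EXT4_INLINE_DOTDOT_SIZE   4

    int main(void)
    {
            int pos = 0, inline_size = 16;

            while (pos < inline_size) {
                    if (pos == 0) {
                            puts("emit fake \".\"");
                            pos = EXT4_INLINE_DOTDOT_OFFSET;
                    } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
                            puts("emit fake \"..\"");
                            pos = EXT4_INLINE_DOTDOT_SIZE;
                    } else {
                            printf("real entry at buf offset %d\n", pos);
                            pos += 12;     /* pretend rec_len == 12 */
                    }
            }
            return 0;
    }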
+/*
+ * This function is called when the volume is mkfsed with
+ * dir_index disabled. In order to keep f_pos persistent
+ * after we convert from an inlined dir to a block-based one,
+ * we just pretend that we are a normal dir and return the
+ * offset as if '.' and '..' really took up space.
+ *
+ */
int ext4_read_inline_dir(struct file *filp,
void *dirent, filldir_t filldir,
int *has_inline_data)
@@ -1302,6 +1417,7 @@ int ext4_read_inline_dir(struct file *filp,
int ret, inline_size = 0;
struct ext4_iloc iloc;
void *dir_buf = NULL;
+ int dotdot_offset, dotdot_size, extra_offset, extra_size;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -1330,8 +1446,21 @@ int ext4_read_inline_dir(struct file *filp,
sb = inode->i_sb;
stored = 0;
parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+ offset = filp->f_pos;
- while (!error && !stored && filp->f_pos < inode->i_size) {
+ /*
+	 * dotdot_offset and dotdot_size are the real offset and
+	 * size for ".." and "." if the dir were block based, while
+	 * their actual inline size is only EXT4_INLINE_DOTDOT_SIZE.
+	 * So we use extra_offset and extra_size to account for them
+	 * during the inline dir iteration.
+ */
+ dotdot_offset = EXT4_DIR_REC_LEN(1);
+ dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
+ extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
+ extra_size = extra_offset + inline_size;
+
+ while (!error && !stored && filp->f_pos < extra_size) {
revalidate:
/*
* If the version has changed since the last call to
@@ -1340,15 +1469,23 @@ revalidate:
* dir to make sure.
*/
if (filp->f_version != inode->i_version) {
- for (i = 0;
- i < inode->i_size && i < offset;) {
+ for (i = 0; i < extra_size && i < offset;) {
+ /*
+ * "." is with offset 0 and
+ * ".." is dotdot_offset.
+ */
if (!i) {
- /* skip "." and ".." if needed. */
- i += EXT4_INLINE_DOTDOT_SIZE;
+ i = dotdot_offset;
+ continue;
+ } else if (i == dotdot_offset) {
+ i = dotdot_size;
continue;
}
+			/* for any other entry, the real offset in
+			 * the buf has to be adjusted accordingly.
+ */
de = (struct ext4_dir_entry_2 *)
- (dir_buf + i);
+ (dir_buf + i - extra_offset);
/* It's too expensive to do a full
* dirent test each time round this
* loop, but we do have to test at
@@ -1356,43 +1493,47 @@ revalidate:
* failure will be detected in the
* dirent test below. */
if (ext4_rec_len_from_disk(de->rec_len,
- inline_size) < EXT4_DIR_REC_LEN(1))
+ extra_size) < EXT4_DIR_REC_LEN(1))
break;
i += ext4_rec_len_from_disk(de->rec_len,
- inline_size);
+ extra_size);
}
offset = i;
filp->f_pos = offset;
filp->f_version = inode->i_version;
}
- while (!error && filp->f_pos < inode->i_size) {
+ while (!error && filp->f_pos < extra_size) {
if (filp->f_pos == 0) {
error = filldir(dirent, ".", 1, 0, inode->i_ino,
DT_DIR);
if (error)
break;
stored++;
+ filp->f_pos = dotdot_offset;
+ continue;
+ }
- error = filldir(dirent, "..", 2, 0, parent_ino,
- DT_DIR);
+ if (filp->f_pos == dotdot_offset) {
+ error = filldir(dirent, "..", 2,
+ dotdot_offset,
+ parent_ino, DT_DIR);
if (error)
break;
stored++;
- filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
+ filp->f_pos = dotdot_size;
continue;
}
- de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
+ de = (struct ext4_dir_entry_2 *)
+ (dir_buf + filp->f_pos - extra_offset);
if (ext4_check_dir_entry(inode, filp, de,
iloc.bh, dir_buf,
- inline_size, offset)) {
+ extra_size, filp->f_pos)) {
ret = stored;
goto out;
}
- offset += ext4_rec_len_from_disk(de->rec_len,
- inline_size);
if (le32_to_cpu(de->inode)) {
/* We might block in the next section
* if the data destination is
@@ -1415,9 +1556,8 @@ revalidate:
stored++;
}
filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
- inline_size);
+ extra_size);
}
- offset = 0;
}
out:
kfree(dir_buf);
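The f_pos remapping above hinges on simple arithmetic: a block-based dir spends EXT4_DIR_REC_LEN(1) bytes on '.' and EXT4_DIR_REC_LEN(2) on '..', whereas the inline form spends only 4, so every later f_pos must be shifted by extra_offset before indexing the inline buffer. A runnable check of those numbers (EXT4_DIR_REC_LEN reproduced from fs/ext4/ext4.h; 8 is the fixed dirent header size):

    #include <stdio.h>

    /* rec_len = header (8) + name, rounded up to a multiple of 4 */
    #define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + 3) & ~3)
    #define EXT4_INLINE_DOTDOT_SIZE 4

    int main(void)
    {
            int dotdot_offset = EXT4_DIR_REC_LEN(1);               /* 12 */
            int dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); /* 24 */
            int extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
            long f_pos = 32;            /* some position past ".." */

            printf("dotdot_offset=%d dotdot_size=%d extra_offset=%d\n",
                   dotdot_offset, dotdot_size, extra_offset);
            printf("f_pos %ld -> inline buf offset %ld\n",
                   f_pos, f_pos - extra_offset);
            return 0;
    }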
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b3a5213bc73e..793d44b84d7f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -55,21 +55,21 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
__u16 csum_hi = 0;
__u32 csum;
- csum_lo = raw->i_checksum_lo;
+ csum_lo = le16_to_cpu(raw->i_checksum_lo);
raw->i_checksum_lo = 0;
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
- csum_hi = raw->i_checksum_hi;
+ csum_hi = le16_to_cpu(raw->i_checksum_hi);
raw->i_checksum_hi = 0;
}
csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
EXT4_INODE_SIZE(inode->i_sb));
- raw->i_checksum_lo = csum_lo;
+ raw->i_checksum_lo = cpu_to_le16(csum_lo);
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
- raw->i_checksum_hi = csum_hi;
+ raw->i_checksum_hi = cpu_to_le16(csum_hi);
return csum;
}
@@ -210,8 +210,7 @@ void ext4_evict_inode(struct inode *inode)
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
- jbd2_log_start_commit(journal, commit_tid);
- jbd2_log_wait_commit(journal, commit_tid);
+ jbd2_complete_transaction(journal, commit_tid);
filemap_write_and_wait(&inode->i_data);
}
truncate_inode_pages(&inode->i_data, 0);
@@ -1081,20 +1080,42 @@ retry_journal:
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
+ int ret;
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
set_buffer_uptodate(bh);
- return ext4_handle_dirty_metadata(handle, NULL, bh);
+ ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+ clear_buffer_meta(bh);
+ clear_buffer_prio(bh);
+ return ret;
}
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+/*
+ * We need to pick up the new inode size which generic_commit_write gave us.
+ * `file' can be NULL - eg, when called from page_symlink().
+ *
+ * ext4 never places buffers on inode->i_mapping->private_list. Metadata
+ * buffers are managed internally.
+ */
+static int ext4_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
- int i_size_changed = 0;
- struct inode *inode = mapping->host;
handle_t *handle = ext4_journal_current_handle();
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+ int i_size_changed = 0;
+
+ trace_ext4_write_end(inode, pos, len, copied);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto errout;
+ }
+ }
if (ext4_has_inline_data(inode))
copied = ext4_write_inline_data_end(inode, pos, len,
@@ -1105,7 +1126,7 @@ static int ext4_generic_write_end(struct file *file,
/*
* No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_mutex.
+	 * cannot change under us because we hold i_mutex.
*
* But it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
@@ -1115,10 +1136,10 @@ static int ext4_generic_write_end(struct file *file,
i_size_changed = 1;
}
- if (pos + copied > EXT4_I(inode)->i_disksize) {
+ if (pos + copied > EXT4_I(inode)->i_disksize) {
/* We need to mark inode dirty even if
* new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
+ * but greater than i_disksize. (hint delalloc)
*/
ext4_update_i_disksize(inode, (pos + copied));
i_size_changed = 1;
@@ -1135,87 +1156,15 @@ static int ext4_generic_write_end(struct file *file,
if (i_size_changed)
ext4_mark_inode_dirty(handle, inode);
- return copied;
-}
-
-/*
- * We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
- *
- * ext4 never places buffers on inode->i_mapping->private_list. metadata
- * buffers are managed internally.
- */
-static int ext4_ordered_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = mapping->host;
- int ret = 0, ret2;
-
- trace_ext4_ordered_write_end(inode, pos, len, copied);
- ret = ext4_jbd2_file_inode(handle, inode);
-
- if (ret == 0) {
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- copied = ret2;
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
- /* if we have allocated more blocks and copied
- * less. We will have blocks allocated outside
- * inode->i_size. So truncate them
- */
- ext4_orphan_add(handle, inode);
- if (ret2 < 0)
- ret = ret2;
- } else {
- unlock_page(page);
- page_cache_release(page);
- }
-
- ret2 = ext4_journal_stop(handle);
- if (!ret)
- ret = ret2;
-
- if (pos + len > inode->i_size) {
- ext4_truncate_failed_write(inode);
- /*
- * If truncate failed early the inode might still be
- * on the orphan list; we need to make sure the inode
- * is removed from the orphan list in that case.
- */
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
- }
-
-
- return ret ? ret : copied;
-}
-
-static int ext4_writeback_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = mapping->host;
- int ret = 0, ret2;
-
- trace_ext4_writeback_write_end(inode, pos, len, copied);
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- copied = ret2;
+ if (copied < 0)
+ ret = copied;
if (pos + len > inode->i_size && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
*/
ext4_orphan_add(handle, inode);
-
- if (ret2 < 0)
- ret = ret2;
-
+errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1538,7 +1487,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
struct ext4_io_submit io_submit;
BUG_ON(mpd->next_page <= mpd->first_page);
- memset(&io_submit, 0, sizeof(io_submit));
+ ext4_io_submit_init(&io_submit, mpd->wbc);
+ io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_submit.io_end)
+ return -ENOMEM;
/*
* We need to start from the first_page to the next_page - 1
* to make sure we also write the mapped dirty buffer_heads.
@@ -1626,6 +1578,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
pagevec_release(&pvec);
}
ext4_io_submit(&io_submit);
+ /* Drop io_end reference we got from init */
+ ext4_put_io_end_defer(io_submit.io_end);
return ret;
}
@@ -1670,22 +1624,25 @@ static void ext4_print_free_blocks(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct super_block *sb = inode->i_sb;
+ struct ext4_inode_info *ei = EXT4_I(inode);
ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
EXT4_C2B(EXT4_SB(inode->i_sb),
- ext4_count_free_clusters(inode->i_sb)));
+ ext4_count_free_clusters(sb)));
ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
- (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+ (long long) EXT4_C2B(EXT4_SB(sb),
percpu_counter_sum(&sbi->s_freeclusters_counter)));
ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
- (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+ (long long) EXT4_C2B(EXT4_SB(sb),
percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
ext4_msg(sb, KERN_CRIT, "Block reservation details");
ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
- EXT4_I(inode)->i_reserved_data_blocks);
+ ei->i_reserved_data_blocks);
ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
- EXT4_I(inode)->i_reserved_meta_blocks);
+ ei->i_reserved_meta_blocks);
+ ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u",
+ ei->i_allocated_meta_blocks);
return;
}
@@ -1740,12 +1697,21 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
*/
map.m_lblk = next;
map.m_len = max_blocks;
- get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+ /*
+ * We're in delalloc path and it is possible that we're going to
+ * need more metadata blocks than previously reserved. However
+ * we must not fail because we're in writeback and there is
+	 * nothing we can do about it, so failing might result in data loss.
+ * So use reserved blocks to allocate metadata if possible.
+ */
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL;
if (ext4_should_dioread_nolock(mpd->inode))
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
if (mpd->b_state & (1 << BH_Delay))
get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
if (blks < 0) {
struct super_block *sb = mpd->inode->i_sb;
@@ -2272,9 +2238,16 @@ static int ext4_writepage(struct page *page,
*/
return __ext4_journalled_writepage(page, len);
- memset(&io_submit, 0, sizeof(io_submit));
+ ext4_io_submit_init(&io_submit, wbc);
+ io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_submit.io_end) {
+ redirty_page_for_writepage(wbc, page);
+ return -ENOMEM;
+ }
ret = ext4_bio_write_page(&io_submit, page, len, wbc);
ext4_io_submit(&io_submit);
+ /* Drop io_end reference we got from init */
+ ext4_put_io_end_defer(io_submit.io_end);
return ret;
}
@@ -2661,7 +2634,7 @@ out_writepages:
static int ext4_nonda_switch(struct super_block *sb)
{
- s64 free_blocks, dirty_blocks;
+ s64 free_clusters, dirty_clusters;
struct ext4_sb_info *sbi = EXT4_SB(sb);
/*
@@ -2672,17 +2645,18 @@ static int ext4_nonda_switch(struct super_block *sb)
* Delalloc need an accurate free block accounting. So switch
* to non delalloc when we are near to error range.
*/
- free_blocks = EXT4_C2B(sbi,
- percpu_counter_read_positive(&sbi->s_freeclusters_counter));
- dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+ free_clusters =
+ percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+ dirty_clusters =
+ percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
/*
* Start pushing delalloc when 1/2 of free blocks are dirty.
*/
- if (dirty_blocks && (free_blocks < 2 * dirty_blocks))
+ if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
- if (2 * free_blocks < 3 * dirty_blocks ||
- free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
+ if (2 * free_clusters < 3 * dirty_clusters ||
+ free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
/*
* free block count is less than 150% of dirty blocks
* or free blocks is less than watermark
@@ -2818,18 +2792,9 @@ static int ext4_da_write_end(struct file *file,
unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;
- if (write_mode == FALL_BACK_TO_NONDELALLOC) {
- switch (ext4_inode_journal_mode(inode)) {
- case EXT4_INODE_ORDERED_DATA_MODE:
- return ext4_ordered_write_end(file, mapping, pos,
- len, copied, page, fsdata);
- case EXT4_INODE_WRITEBACK_DATA_MODE:
- return ext4_writeback_write_end(file, mapping, pos,
- len, copied, page, fsdata);
- default:
- BUG();
- }
- }
+ if (write_mode == FALL_BACK_TO_NONDELALLOC)
+ return ext4_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
start = pos & (PAGE_CACHE_SIZE - 1);
@@ -3113,9 +3078,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
struct inode *inode = file_inode(iocb->ki_filp);
ext4_io_end_t *io_end = iocb->private;
- /* if not async direct IO or dio with 0 bytes write, just return */
- if (!io_end || !size)
- goto out;
+ /* if not async direct IO just return */
+ if (!io_end) {
+ inode_dio_done(inode);
+ if (is_async)
+ aio_complete(iocb, ret, 0);
+ return;
+ }
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3123,25 +3092,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
size);
iocb->private = NULL;
-
- /* if not aio dio with unwritten extents, just free io and return */
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- ext4_free_io_end(io_end);
-out:
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
- return;
- }
-
io_end->offset = offset;
io_end->size = size;
if (is_async) {
io_end->iocb = iocb;
io_end->result = ret;
}
-
- ext4_add_complete_io(io_end);
+ ext4_put_io_end_defer(io_end);
}
/*
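The io_end changes in this hunk and in the write-path hunks above move from "free unless someone consumed it" to plain reference counting: the submitter takes a reference, every completion path drops one with a put, and the structure is processed and freed only when the last reference goes. The pattern in miniature, as a hedged userspace sketch (names illustrative, not the ext4 API):

    #include <stdio.h>
    #include <stdlib.h>

    struct io_end { int refcount; };

    static struct io_end *io_end_get(struct io_end *io)
    {
            io->refcount++;
            return io;
    }

    static void io_end_put(struct io_end *io)
    {
            if (--io->refcount == 0) {
                    puts("last ref dropped: process and free io_end");
                    free(io);
            }
    }

    int main(void)
    {
            struct io_end *io = calloc(1, sizeof(*io));

            io->refcount = 1;    /* submitter's reference */
            io_end_get(io);      /* extra ref handed to the DIO/bio */
            io_end_put(io);      /* completion drops its ref */
            io_end_put(io);      /* submitter drops; now processed */
            return 0;
    }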
@@ -3175,6 +3132,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
get_block_t *get_block_func = NULL;
int dio_flags = 0;
loff_t final_size = offset + count;
+ ext4_io_end_t *io_end = NULL;
/* Use the old path for reads and writes beyond i_size. */
if (rw != WRITE || final_size > inode->i_size)
@@ -3213,13 +3171,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
iocb->private = NULL;
ext4_inode_aio_set(inode, NULL);
if (!is_sync_kiocb(iocb)) {
- ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
+ io_end = ext4_init_io_end(inode, GFP_NOFS);
if (!io_end) {
ret = -ENOMEM;
goto retake_lock;
}
io_end->flag |= EXT4_IO_END_DIRECT;
- iocb->private = io_end;
+ /*
+ * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+ */
+ iocb->private = ext4_get_io_end(io_end);
/*
* we save the io structure for current async direct
* IO, so that later ext4_map_blocks() could flag the
@@ -3243,26 +3204,27 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
NULL,
dio_flags);
- if (iocb->private)
- ext4_inode_aio_set(inode, NULL);
/*
- * The io_end structure takes a reference to the inode, that
- * structure needs to be destroyed and the reference to the
- * inode need to be dropped, when IO is complete, even with 0
- * byte write, or failed.
- *
- * In the successful AIO DIO case, the io_end structure will
- * be destroyed and the reference to the inode will be dropped
- * after the end_io call back function is called.
- *
- * In the case there is 0 byte write, or error case, since VFS
- * direct IO won't invoke the end_io call back function, we
- * need to free the end_io structure here.
+ * Put our reference to io_end. This can free the io_end structure e.g.
+	 * in the sync IO case or in case of error. It can even perform extent
+ * conversion if all bios we submitted finished before we got here.
+ * Note that in that case iocb->private can be already set to NULL
+ * here.
*/
- if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
- ext4_free_io_end(iocb->private);
- iocb->private = NULL;
- } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+ if (io_end) {
+ ext4_inode_aio_set(inode, NULL);
+ ext4_put_io_end(io_end);
+ /*
+		 * In case of error or a zero-byte write, ext4_end_io_dio() was
+		 * not called, so we have to put the iocb's reference.
+ */
+ if (ret <= 0 && ret != -EIOCBQUEUED) {
+ WARN_ON(iocb->private != io_end);
+ ext4_put_io_end(io_end);
+ iocb->private = NULL;
+ }
+ }
+ if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
int err;
/*
@@ -3334,27 +3296,12 @@ static int ext4_journalled_set_page_dirty(struct page *page)
return __set_page_dirty_nobuffers(page);
}
-static const struct address_space_operations ext4_ordered_aops = {
+static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
.write_begin = ext4_write_begin,
- .write_end = ext4_ordered_write_end,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
- .migratepage = buffer_migrate_page,
- .is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
-};
-
-static const struct address_space_operations ext4_writeback_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_writepage,
- .write_begin = ext4_write_begin,
- .write_end = ext4_writeback_write_end,
+ .write_end = ext4_write_end,
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
@@ -3399,23 +3346,21 @@ void ext4_set_aops(struct inode *inode)
{
switch (ext4_inode_journal_mode(inode)) {
case EXT4_INODE_ORDERED_DATA_MODE:
- if (test_opt(inode->i_sb, DELALLOC))
- inode->i_mapping->a_ops = &ext4_da_aops;
- else
- inode->i_mapping->a_ops = &ext4_ordered_aops;
+ ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
break;
case EXT4_INODE_WRITEBACK_DATA_MODE:
- if (test_opt(inode->i_sb, DELALLOC))
- inode->i_mapping->a_ops = &ext4_da_aops;
- else
- inode->i_mapping->a_ops = &ext4_writeback_aops;
+ ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
break;
case EXT4_INODE_JOURNAL_DATA_MODE:
inode->i_mapping->a_ops = &ext4_journalled_aops;
- break;
+ return;
default:
BUG();
}
+ if (test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else
+ inode->i_mapping->a_ops = &ext4_aops;
}
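After this change the ordered/writeback distinction is carried by the EXT4_STATE_ORDERED_MODE inode state bit rather than by separate address_space_operations tables, so the table choice reduces to journal-data vs. delalloc vs. plain. Spelled out as a hedged, runnable sketch (the enum and function names are invented for illustration):

    #include <stdio.h>

    enum jmode { ORDERED, WRITEBACK, JOURNAL_DATA };

    static const char *pick_aops(enum jmode mode, int delalloc)
    {
            if (mode == JOURNAL_DATA)
                    return "ext4_journalled_aops";
            /* ORDERED vs. WRITEBACK now only toggles a state bit */
            return delalloc ? "ext4_da_aops" : "ext4_aops";
    }

    int main(void)
    {
            printf("%s\n", pick_aops(ORDERED, 1));    /* ext4_da_aops */
            printf("%s\n", pick_aops(WRITEBACK, 0));  /* ext4_aops */
            return 0;
    }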
@@ -3646,20 +3591,190 @@ int ext4_can_truncate(struct inode *inode)
int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t first_block, stop_block;
+ struct address_space *mapping = inode->i_mapping;
+ loff_t first_page, last_page, page_len;
+ loff_t first_page_offset, last_page_offset;
+ handle_t *handle;
+ unsigned int credits;
+ int ret = 0;
+
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- return ext4_ind_punch_hole(file, offset, length);
-
- if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
+ if (EXT4_SB(sb)->s_cluster_ratio > 1) {
/* TODO: Add support for bigalloc file systems */
return -EOPNOTSUPP;
}
trace_ext4_punch_hole(inode, offset, length);
- return ext4_ext_punch_hole(file, offset, length);
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ ret = filemap_write_and_wait_range(mapping, offset,
+ offset + length - 1);
+ if (ret)
+ return ret;
+ }
+
+ mutex_lock(&inode->i_mutex);
+	/* It's not possible to punch a hole in an append-only file */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+ ret = -EPERM;
+ goto out_mutex;
+ }
+ if (IS_SWAPFILE(inode)) {
+ ret = -ETXTBSY;
+ goto out_mutex;
+ }
+
+ /* No need to punch hole beyond i_size */
+ if (offset >= inode->i_size)
+ goto out_mutex;
+
+ /*
+ * If the hole extends beyond i_size, set the hole
+ * to end after the page that contains i_size
+ */
+ if (offset + length > inode->i_size) {
+ length = inode->i_size +
+ PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+ offset;
+ }
+
+ first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+
+ first_page_offset = first_page << PAGE_CACHE_SHIFT;
+ last_page_offset = last_page << PAGE_CACHE_SHIFT;
+
+ /* Now release the pages */
+ if (last_page_offset > first_page_offset) {
+ truncate_pagecache_range(inode, first_page_offset,
+ last_page_offset - 1);
+ }
+
+	/* Wait for all existing DIO workers; newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ ret = ext4_flush_unwritten_io(inode);
+ if (ret)
+ goto out_dio;
+ inode_dio_wait(inode);
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ credits = ext4_writepage_trans_blocks(inode);
+ else
+ credits = ext4_blocks_for_truncate(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_std_error(sb, ret);
+ goto out_dio;
+ }
+
+ /*
+ * Now we need to zero out the non-page-aligned data in the
+ * pages at the start and tail of the hole, and unmap the
+ * buffer heads for the block aligned regions of the page that
+ * were completely zeroed.
+ */
+ if (first_page > last_page) {
+ /*
+ * If the file space being truncated is contained
+ * within a page just zero out and unmap the middle of
+ * that page
+ */
+ ret = ext4_discard_partial_page_buffers(handle,
+ mapping, offset, length, 0);
+
+ if (ret)
+ goto out_stop;
+ } else {
+ /*
+ * zero out and unmap the partial page that contains
+ * the start of the hole
+ */
+ page_len = first_page_offset - offset;
+ if (page_len > 0) {
+ ret = ext4_discard_partial_page_buffers(handle, mapping,
+ offset, page_len, 0);
+ if (ret)
+ goto out_stop;
+ }
+
+ /*
+ * zero out and unmap the partial page that contains
+ * the end of the hole
+ */
+ page_len = offset + length - last_page_offset;
+ if (page_len > 0) {
+ ret = ext4_discard_partial_page_buffers(handle, mapping,
+ last_page_offset, page_len, 0);
+ if (ret)
+ goto out_stop;
+ }
+ }
+
+ /*
+ * If i_size is contained in the last page, we need to
+ * unmap and zero the partial page after i_size
+ */
+ if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
+ inode->i_size % PAGE_CACHE_SIZE != 0) {
+ page_len = PAGE_CACHE_SIZE -
+ (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+ if (page_len > 0) {
+ ret = ext4_discard_partial_page_buffers(handle,
+ mapping, inode->i_size, page_len, 0);
+
+ if (ret)
+ goto out_stop;
+ }
+ }
+
+ first_block = (offset + sb->s_blocksize - 1) >>
+ EXT4_BLOCK_SIZE_BITS(sb);
+ stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ /* If there are no blocks to remove, return now */
+ if (first_block >= stop_block)
+ goto out_stop;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ ret = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ret = ext4_ext_remove_space(inode, first_block,
+ stop_block - 1);
+ else
+ ret = ext4_free_hole_blocks(handle, inode, first_block,
+ stop_block);
+
+ ext4_discard_preallocations(inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+out_stop:
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
}
/*
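The block-range computation near the end of the new ext4_punch_hole() rounds the start up and the end down, so only blocks lying wholly inside the hole are freed; the partial pages at either edge are handled by the zeroing calls above instead. A runnable check of that rounding (a 4096-byte block size is assumed for the example):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long blocksize = 4096, offset = 5000, length = 9000;
            unsigned long long first_block, stop_block;

            /* round the start up to the next block boundary */
            first_block = (offset + blocksize - 1) / blocksize;   /* 2 */
            /* round the end down to a block boundary */
            stop_block = (offset + length) / blocksize;           /* 3 */

            if (first_block >= stop_block)
                    puts("hole smaller than one block: nothing to free");
            else
                    printf("free blocks [%llu, %llu)\n",
                           first_block, stop_block);
            return 0;
    }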
@@ -3692,6 +3807,19 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
*/
void ext4_truncate(struct inode *inode)
{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int credits;
+ handle_t *handle;
+ struct address_space *mapping = inode->i_mapping;
+ loff_t page_len;
+
+ /*
+ * There is a possibility that we're either freeing the inode
+ * or it completely new indode. In those cases we might not
+ * have i_mutex locked because it's not necessary.
+ */
+ if (!(inode->i_state & (I_NEW|I_FREEING)))
+ WARN_ON(!mutex_is_locked(&inode->i_mutex));
trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
@@ -3710,10 +3838,72 @@ void ext4_truncate(struct inode *inode)
return;
}
+ /*
+ * finish any pending end_io work so we won't run the risk of
+ * converting any truncated blocks to initialized later
+ */
+ ext4_flush_unwritten_io(inode);
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ credits = ext4_writepage_trans_blocks(inode);
+ else
+ credits = ext4_blocks_for_truncate(inode);
+
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ext4_std_error(inode->i_sb, PTR_ERR(handle));
+ return;
+ }
+
+ if (inode->i_size % PAGE_CACHE_SIZE != 0) {
+ page_len = PAGE_CACHE_SIZE -
+ (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+ if (ext4_discard_partial_page_buffers(handle,
+ mapping, inode->i_size, page_len, 0))
+ goto out_stop;
+ }
+
+ /*
+ * We add the inode to the orphan list, so that if this
+ * truncate spans multiple transactions, and we crash, we will
+ * resume the truncate when the filesystem recovers. It also
+ * marks the inode dirty, to catch the new size.
+ *
+ * Implication: the file must always be in a sane, consistent
+ * truncatable state while each transaction commits.
+ */
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+
+ ext4_discard_preallocations(inode);
+
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ext4_ext_truncate(inode);
+ ext4_ext_truncate(handle, inode);
else
- ext4_ind_truncate(inode);
+ ext4_ind_truncate(handle, inode);
+
+ up_write(&ei->i_data_sem);
+
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+out_stop:
+ /*
+ * If this was a simple ftruncate() and the file will remain alive,
+ * then we need to clear up the orphan record which we created above.
+ * However, if this was a real unlink then we were called by
+ * ext4_delete_inode(), and we allow that function to clean up the
+ * orphan info for us.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
trace_ext4_truncate_exit(inode);
}
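The orphan-list steps folded into ext4_truncate() above implement a crash-safety protocol: add the inode to the on-disk orphan list before freeing any blocks, so a multi-transaction truncate interrupted by a crash can be resumed by recovery, and remove it afterwards only if the file stays alive (nlink > 0). A toy model of that ordering, runnable in userspace (all names invented):

    #include <stdio.h>

    struct toy_inode { int nlink; int on_orphan_list; };

    static void toy_truncate(struct toy_inode *inode)
    {
            inode->on_orphan_list = 1;   /* journalled before any freeing */
            puts("freeing blocks (may span several transactions)");
            /* a crash here is safe: recovery finds the orphan entry
             * and finishes the truncate */
            if (inode->nlink)            /* plain ftruncate(): clean up */
                    inode->on_orphan_list = 0;
            /* the unlink path leaves the entry for inode eviction */
    }

    int main(void)
    {
            struct toy_inode inode = { .nlink = 1 };

            toy_truncate(&inode);
            printf("on_orphan_list=%d\n", inode.on_orphan_list);  /* 0 */
            return 0;
    }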
@@ -3821,13 +4011,14 @@ make_io:
if (EXT4_SB(sb)->s_inode_readahead_blks) {
ext4_fsblk_t b, end, table;
unsigned num;
+ __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
table = ext4_inode_table(sb, gdp);
/* s_inode_readahead_blks is always a power of 2 */
- b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+ b = block & ~((ext4_fsblk_t) ra_blks - 1);
if (table > b)
b = table;
- end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+ end = b + ra_blks;
num = EXT4_INODES_PER_GROUP(sb);
if (ext4_has_group_desc_csum(sb))
num -= ext4_itable_unused_count(sb, gdp);
@@ -4024,8 +4215,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
* NeilBrown 1999oct15
*/
if (inode->i_nlink == 0) {
- if (inode->i_mode == 0 ||
- !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+ if ((inode->i_mode == 0 ||
+ !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
+ ino != EXT4_BOOT_LOADER_INO) {
/* this inode is deleted */
ret = -ESTALE;
goto bad_inode;
@@ -4033,7 +4225,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
/* The only unlinked inodes we let through here have
* valid i_mode and are being read by the orphan
* recovery code: that's fine, we're about to complete
- * the process of deleting those. */
+ * the process of deleting those.
+		 * OR it is EXT4_BOOT_LOADER_INO, which is
+ * not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
@@ -4153,6 +4347,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+ } else if (ino == EXT4_BOOT_LOADER_INO) {
+ make_bad_inode(inode);
} else {
ret = -EIO;
EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 721f4d33e148..9491ac0590f7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -17,9 +17,201 @@
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
+#include "ext4_extents.h"
#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
+/**
+ * Swap memory between @a and @b for @len bytes.
+ *
+ * @a: pointer to first memory area
+ * @b: pointer to second memory area
+ * @len: number of bytes to swap
+ *
+ */
+static void memswap(void *a, void *b, size_t len)
+{
+ unsigned char *ap, *bp;
+ unsigned char tmp;
+
+ ap = (unsigned char *)a;
+ bp = (unsigned char *)b;
+ while (len-- > 0) {
+ tmp = *ap;
+ *ap = *bp;
+ *bp = tmp;
+ ap++;
+ bp++;
+ }
+}
+
+/**
+ * Swap i_data and associated attributes between @inode1 and @inode2.
+ * This function is used for the primary swap between inode1 and inode2
+ * and also to revert this primary swap in case of errors.
+ *
+ * Therefore you have to make sure that calling this method twice
+ * will revert all changes.
+ *
+ * @inode1: pointer to first inode
+ * @inode2: pointer to second inode
+ */
+static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+{
+ loff_t isize;
+ struct ext4_inode_info *ei1;
+ struct ext4_inode_info *ei2;
+
+ ei1 = EXT4_I(inode1);
+ ei2 = EXT4_I(inode2);
+
+ memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
+ memswap(&inode1->i_version, &inode2->i_version,
+ sizeof(inode1->i_version));
+ memswap(&inode1->i_blocks, &inode2->i_blocks,
+ sizeof(inode1->i_blocks));
+ memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
+ memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
+ memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
+
+ memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
+ memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
+ memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+ memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree));
+ memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr));
+
+ isize = i_size_read(inode1);
+ i_size_write(inode1, i_size_read(inode2));
+ i_size_write(inode2, isize);
+}
+
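swap_inode_data() relies on byte-wise swapping being an involution: applying the same sequence of memswap() calls a second time restores both inodes exactly, which is what the revert-on-error path in swap_inode_boot_loader() below depends on. A runnable demonstration using the same memswap body:

    #include <stdio.h>
    #include <string.h>

    static void memswap(void *a, void *b, size_t len)
    {
            unsigned char *ap = a, *bp = b, tmp;

            while (len-- > 0) {
                    tmp = *ap; *ap = *bp; *bp = tmp;
                    ap++; bp++;
            }
    }

    int main(void)
    {
            long x = 1, y = 2;

            memswap(&x, &y, sizeof(x));
            printf("after swap:   x=%ld y=%ld\n", x, y);   /* 2 1 */
            memswap(&x, &y, sizeof(x));                    /* revert */
            printf("after revert: x=%ld y=%ld\n", x, y);   /* 1 2 */
            return 0;
    }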
+/**
+ * Swap the information between the given @inode and the inode
+ * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
+ * important fields of the inodes.
+ *
+ * @sb: the super block of the filesystem
+ * @inode: the inode to swap with EXT4_BOOT_LOADER_INO
+ *
+ */
+static long swap_inode_boot_loader(struct super_block *sb,
+ struct inode *inode)
+{
+ handle_t *handle;
+ int err;
+ struct inode *inode_bl;
+ struct ext4_inode_info *ei;
+ struct ext4_inode_info *ei_bl;
+ struct ext4_sb_info *sbi;
+
+ if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
+ err = -EINVAL;
+ goto swap_boot_out;
+ }
+
+ if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto swap_boot_out;
+ }
+
+ sbi = EXT4_SB(sb);
+ ei = EXT4_I(inode);
+
+ inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+ if (IS_ERR(inode_bl)) {
+ err = PTR_ERR(inode_bl);
+ goto swap_boot_out;
+ }
+ ei_bl = EXT4_I(inode_bl);
+
+ filemap_flush(inode->i_mapping);
+ filemap_flush(inode_bl->i_mapping);
+
+	/* Protect the original inodes against truncation and make sure
+	 * that only one swap_inode_boot_loader() is running at a time. */
+ ext4_inode_double_lock(inode, inode_bl);
+
+ truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages(&inode_bl->i_data, 0);
+
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ ext4_inode_block_unlocked_dio(inode_bl);
+ inode_dio_wait(inode);
+ inode_dio_wait(inode_bl);
+
+ handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
+ if (IS_ERR(handle)) {
+		err = -EINVAL;
+		goto journal_err_out;
+ }
+
+ /* Protect extent tree against block allocations via delalloc */
+ ext4_double_down_write_data_sem(inode, inode_bl);
+
+ if (inode_bl->i_nlink == 0) {
+ /* this inode has never been used as a BOOT_LOADER */
+ set_nlink(inode_bl, 1);
+ i_uid_write(inode_bl, 0);
+ i_gid_write(inode_bl, 0);
+ inode_bl->i_flags = 0;
+ ei_bl->i_flags = 0;
+ inode_bl->i_version = 1;
+ i_size_write(inode_bl, 0);
+ inode_bl->i_mode = S_IFREG;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
+ ext4_ext_tree_init(handle, inode_bl);
+ } else
+ memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
+ }
+
+ swap_inode_data(inode, inode_bl);
+
+ inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
+
+ spin_lock(&sbi->s_next_gen_lock);
+ inode->i_generation = sbi->s_next_generation++;
+ inode_bl->i_generation = sbi->s_next_generation++;
+ spin_unlock(&sbi->s_next_gen_lock);
+
+ ext4_discard_preallocations(inode);
+
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err < 0) {
+ ext4_warning(inode->i_sb,
+ "couldn't mark inode #%lu dirty (err %d)",
+ inode->i_ino, err);
+ /* Revert all changes: */
+ swap_inode_data(inode, inode_bl);
+ } else {
+ err = ext4_mark_inode_dirty(handle, inode_bl);
+ if (err < 0) {
+ ext4_warning(inode_bl->i_sb,
+ "couldn't mark inode #%lu dirty (err %d)",
+ inode_bl->i_ino, err);
+ /* Revert all changes: */
+ swap_inode_data(inode, inode_bl);
+ ext4_mark_inode_dirty(handle, inode);
+ }
+ }
+
+ ext4_journal_stop(handle);
+
+ ext4_double_up_write_data_sem(inode, inode_bl);
+
+journal_err_out:
+	ext4_inode_resume_unlocked_dio(inode);
+ ext4_inode_resume_unlocked_dio(inode_bl);
+
+ ext4_inode_double_unlock(inode, inode_bl);
+
+ iput(inode_bl);
+
+swap_boot_out:
+ return err;
+}
+
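For context, the new ioctl would be driven from userspace roughly as in the sketch below. This is an illustration, not part of the patch: the ioctl number is an assumption based on the definition the series adds to ext4.h, and the ioctl takes no argument.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	/* Assumed to match the definition added to fs/ext4/ext4.h. */
	#ifndef EXT4_IOC_SWAP_BOOT
	#define EXT4_IOC_SWAP_BOOT	_IO('f', 17)
	#endif

	int main(int argc, char **argv)
	{
		int fd;

		if (argc < 2)
			return 1;
		/* The handler requires FMODE_WRITE, so open read-write. */
		fd = open(argv[1], O_RDWR);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Swap the file's blocks with inode 5 (EXT4_BOOT_LOADER_INO). */
		if (ioctl(fd, EXT4_IOC_SWAP_BOOT) < 0) {
			perror("EXT4_IOC_SWAP_BOOT");
			return 1;
		}
		return close(fd);
	}
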
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
- if (oldflags & EXT4_EXTENTS_FL) {
- /* We don't support clearning extent flags */
- if (!(flags & EXT4_EXTENTS_FL)) {
- err = -EOPNOTSUPP;
- goto flags_out;
- }
- } else if (flags & EXT4_EXTENTS_FL) {
- /* migrate the file */
+ if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
migrate = 1;
- flags &= ~EXT4_EXTENTS_FL;
- }
if (flags & EXT4_EOFBLOCKS_FL) {
/* we don't support adding EOFBLOCKS flag */
@@ -137,8 +320,13 @@ flags_err:
err = ext4_change_inode_journal_flag(inode, jflag);
if (err)
goto flags_out;
- if (migrate)
- err = ext4_ext_migrate(inode);
+ if (migrate) {
+ if (flags & EXT4_EXTENTS_FL)
+ err = ext4_ext_migrate(inode);
+ else
+ err = ext4_ind_migrate(inode);
+ }
+
flags_out:
mutex_unlock(&inode->i_mutex);
mnt_drop_write_file(filp);
@@ -357,9 +545,13 @@ group_add_out:
return err;
}
+ case EXT4_IOC_SWAP_BOOT:
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+ return swap_inode_boot_loader(sb, inode);
+
case EXT4_IOC_RESIZE_FS: {
ext4_fsblk_t n_blocks_count;
- struct super_block *sb = inode->i_sb;
int err = 0, err2 = 0;
ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ee6614bdb639..a11ea4d6164c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -405,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr)
ext4_clear_bit(bit, addr);
}
+static inline int mb_test_and_clear_bit(int bit, void *addr)
+{
+ addr = mb_correct_addr_and_bit(&bit, addr);
+ return ext4_test_and_clear_bit(bit, addr);
+}
+
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
int fix = 0, ret, tmpmax;
@@ -764,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
spin_unlock(&EXT4_SB(sb)->s_bal_lock);
}
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+ int count;
+ int order = 1;
+ void *buddy;
+
+ while ((buddy = mb_find_buddy(e4b, order++, &count))) {
+ ext4_set_bits(buddy, 0, count);
+ }
+ e4b->bd_info->bb_fragments = 0;
+ memset(e4b->bd_info->bb_counters, 0,
+ sizeof(*e4b->bd_info->bb_counters) *
+ (e4b->bd_sb->s_blocksize_bits + 2));
+
+ ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+ e4b->bd_bitmap, e4b->bd_group);
+}
+
/* The buddy information is attached to the buddy cache inode
* for convenience. The information regarding each group
* is loaded via ext4_mb_load_buddy. The information involve
@@ -860,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
- int group;
-
group = (first_block + i) >> 1;
if (group >= ngroups)
break;
@@ -1011,6 +1033,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
struct page *page;
int ret = 0;
+ might_sleep();
mb_debug(1, "init group %u\n", group);
this_grp = ext4_get_group_info(sb, group);
/*
@@ -1082,6 +1105,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct inode *inode = sbi->s_buddy_cache;
+ might_sleep();
mb_debug(1, "load group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
@@ -1244,6 +1268,33 @@ static void mb_clear_bits(void *bm, int cur, int len)
}
}
 +/* Clear bits in the given range. Returns the position of the first
 + * zero bit found (i.e. a block that was already free), or -1 if every
 + * bit in the range was set.
 + */
+static int mb_test_and_clear_bits(void *bm, int cur, int len)
+{
+ __u32 *addr;
+ int zero_bit = -1;
+
+ len = cur + len;
+ while (cur < len) {
+ if ((cur & 31) == 0 && (len - cur) >= 32) {
+ /* fast path: clear whole word at once */
+ addr = bm + (cur >> 3);
+ if (*addr != (__u32)(-1) && zero_bit == -1)
+ zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
+ *addr = 0;
+ cur += 32;
+ continue;
+ }
+ if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
+ zero_bit = cur;
+ cur++;
+ }
+
+ return zero_bit;
+}
+
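To illustrate the contract with hypothetical values: in the word below bit 5 is already clear, so the first double-freed bit is reported while the whole range is still cleared.

	__u32 word = 0xFFFFFFDF;	/* bit 5 already zero */
	int zero_bit = mb_test_and_clear_bits(&word, 0, 32);
	/* zero_bit == 5, and word == 0 afterwards */
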
void ext4_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1262,17 +1313,90 @@ void ext4_set_bits(void *bm, int cur, int len)
}
}
+/*
+ * _________________________________________________________________ */
+
 +static inline int mb_buddy_adjust_border(int *bit, void *bitmap, int side)
+{
+ if (mb_test_bit(*bit + side, bitmap)) {
+ mb_clear_bit(*bit, bitmap);
+ (*bit) -= side;
+ return 1;
+	} else {
+ (*bit) += side;
+ mb_set_bit(*bit, bitmap);
+ return -1;
+ }
+}
+
+static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
+{
+ int max;
+ int order = 1;
+ void *buddy = mb_find_buddy(e4b, order, &max);
+
+ while (buddy) {
+ void *buddy2;
+
+ /* Bits in range [first; last] are known to be set since
+ * corresponding blocks were allocated. Bits in range
+ * (first; last) will stay set because they form buddies on
+ * upper layer. We just deal with borders if they don't
+ * align with upper layer and then go up.
+	 * Releasing an entire group amounts to clearing a
+	 * single bit of the highest-order buddy.
+ */
+
+ /* Example:
+ * ---------------------------------
+ * | 1 | 1 | 1 | 1 |
+ * ---------------------------------
+ * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
+ * ---------------------------------
+ * 0 1 2 3 4 5 6 7
+ * \_____________________/
+ *
+ * Neither [1] nor [6] is aligned to above layer.
+ * Left neighbour [0] is free, so mark it busy,
+ * decrease bb_counters and extend range to
+ * [0; 6]
+	 * Right neighbour [7] is busy. It can't be coalesced with [6], so
+ * mark [6] free, increase bb_counters and shrink range to
+ * [0; 5].
+ * Then shift range to [0; 2], go up and do the same.
+ */
+
+
+	if (first & 1)
+		e4b->bd_info->bb_counters[order] +=
+			mb_buddy_adjust_border(&first, buddy, -1);
+	if (!(last & 1))
+		e4b->bd_info->bb_counters[order] +=
+			mb_buddy_adjust_border(&last, buddy, 1);
+ if (first > last)
+ break;
+ order++;
+
+ if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
+ mb_clear_bits(buddy, first, last - first + 1);
+ e4b->bd_info->bb_counters[order - 1] += last - first + 1;
+ break;
+ }
+ first >>= 1;
+ last >>= 1;
+ buddy = buddy2;
+ }
+}
+
static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
- int first, int count)
+ int first, int count)
{
- int block = 0;
- int max = 0;
- int order;
- void *buddy;
- void *buddy2;
+ int left_is_free = 0;
+ int right_is_free = 0;
+ int block;
+ int last = first + count - 1;
struct super_block *sb = e4b->bd_sb;
- BUG_ON(first + count > (sb->s_blocksize << 3));
+ BUG_ON(last >= (sb->s_blocksize << 3));
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
@@ -1281,67 +1405,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
if (first < e4b->bd_info->bb_first_free)
e4b->bd_info->bb_first_free = first;
- /* let's maintain fragments counter */
+	/* Access memory sequentially: check the left neighbour,
+	 * clear the range and then check the right neighbour
+ */
if (first != 0)
- block = !mb_test_bit(first - 1, e4b->bd_bitmap);
- if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
- max = !mb_test_bit(first + count, e4b->bd_bitmap);
- if (block && max)
- e4b->bd_info->bb_fragments--;
- else if (!block && !max)
- e4b->bd_info->bb_fragments++;
+ left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
+ block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
+ if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
+ right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
- /* let's maintain buddy itself */
- while (count-- > 0) {
- block = first++;
- order = 0;
+ if (unlikely(block != -1)) {
+ ext4_fsblk_t blocknr;
- if (!mb_test_bit(block, e4b->bd_bitmap)) {
- ext4_fsblk_t blocknr;
-
- blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
- blocknr += EXT4_C2B(EXT4_SB(sb), block);
- ext4_grp_locked_error(sb, e4b->bd_group,
- inode ? inode->i_ino : 0,
- blocknr,
- "freeing already freed block "
- "(bit %u)", block);
- }
- mb_clear_bit(block, e4b->bd_bitmap);
- e4b->bd_info->bb_counters[order]++;
-
- /* start of the buddy */
- buddy = mb_find_buddy(e4b, order, &max);
-
- do {
- block &= ~1UL;
- if (mb_test_bit(block, buddy) ||
- mb_test_bit(block + 1, buddy))
- break;
-
- /* both the buddies are free, try to coalesce them */
- buddy2 = mb_find_buddy(e4b, order + 1, &max);
+ blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+ blocknr += EXT4_C2B(EXT4_SB(sb), block);
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0,
+ blocknr,
+ "freeing already freed block "
+ "(bit %u)", block);
+ mb_regenerate_buddy(e4b);
+ goto done;
+ }
- if (!buddy2)
- break;
+ /* let's maintain fragments counter */
+ if (left_is_free && right_is_free)
+ e4b->bd_info->bb_fragments--;
+ else if (!left_is_free && !right_is_free)
+ e4b->bd_info->bb_fragments++;
- if (order > 0) {
- /* for special purposes, we don't set
- * free bits in bitmap */
- mb_set_bit(block, buddy);
- mb_set_bit(block + 1, buddy);
- }
- e4b->bd_info->bb_counters[order]--;
- e4b->bd_info->bb_counters[order]--;
+ /* buddy[0] == bd_bitmap is a special case, so handle
+ * it right away and let mb_buddy_mark_free stay free of
+	 * zero-order checks.
+	 * Check whether the neighbours are to be coalesced and
+	 * adjust the bitmap, bb_counters and borders accordingly.
+ */
+ if (first & 1) {
+ first += !left_is_free;
+ e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
+ }
+ if (!(last & 1)) {
+ last -= !right_is_free;
+ e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
+ }
- block = block >> 1;
- order++;
- e4b->bd_info->bb_counters[order]++;
+ if (first <= last)
+ mb_buddy_mark_free(e4b, first >> 1, last >> 1);
- mb_clear_bit(block, buddy2);
- buddy = buddy2;
- } while (1);
- }
+done:
mb_set_largest_free_order(sb, e4b->bd_info);
mb_check_buddy(e4b);
}
@@ -3342,7 +3453,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
if (pa->pa_type == MB_GROUP_PA)
grp_blk--;
- ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
+ grp = ext4_get_group_number(sb, grp_blk);
/*
* possible race:
@@ -3807,7 +3918,7 @@ repeat:
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
BUG_ON(pa->pa_type != MB_INODE_PA);
- ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ group = ext4_get_group_number(sb, pa->pa_pstart);
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
@@ -4069,7 +4180,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
- ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ group = ext4_get_group_number(sb, pa->pa_pstart);
if (ext4_mb_load_buddy(sb, group, &e4b)) {
ext4_error(sb, "Error loading buddy information for %u",
group);
@@ -4217,6 +4328,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
unsigned int inquota = 0;
unsigned int reserv_clstrs = 0;
+ might_sleep();
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
@@ -4420,11 +4532,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
node = rb_prev(new_node);
if (node) {
entry = rb_entry(node, struct ext4_free_data, efd_node);
- if (can_merge(entry, new_entry)) {
+ if (can_merge(entry, new_entry) &&
+ ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
new_entry->efd_start_cluster = entry->efd_start_cluster;
new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- ext4_journal_callback_del(handle, &entry->efd_jce);
kmem_cache_free(ext4_free_data_cachep, entry);
}
}
@@ -4432,10 +4544,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
node = rb_next(new_node);
if (node) {
entry = rb_entry(node, struct ext4_free_data, efd_node);
- if (can_merge(new_entry, entry)) {
+ if (can_merge(new_entry, entry) &&
+ ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- ext4_journal_callback_del(handle, &entry->efd_jce);
kmem_cache_free(ext4_free_data_cachep, entry);
}
}
@@ -4470,6 +4582,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
int err = 0;
int ret;
+ might_sleep();
if (bh) {
if (block)
BUG_ON(block != bh->b_blocknr);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 480acf4a085f..49e8bdff9163 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
return retval;
}
return retval;
-
}
int ext4_ext_migrate(struct inode *inode)
@@ -606,3 +605,64 @@ out:
return retval;
}
+
+/*
+ * Migrate a simple extent-based inode to use the i_blocks[] array
+ */
+int ext4_ind_migrate(struct inode *inode)
+{
+ struct ext4_extent_header *eh;
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_extent *ex;
+ unsigned int i, len;
+ ext4_fsblk_t blk;
+ handle_t *handle;
+ int ret;
+
+ if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EINVAL;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ return -EOPNOTSUPP;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_check_inode(inode);
+ if (ret)
+ goto errout;
+
+ eh = ext_inode_hdr(inode);
+ ex = EXT_FIRST_EXTENT(eh);
+ if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
+ eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
+ ret = -EOPNOTSUPP;
+ goto errout;
+ }
+	if (eh->eh_entries == 0) {
+		blk = len = 0;
+	} else {
+ len = le16_to_cpu(ex->ee_len);
+ blk = ext4_ext_pblock(ex);
+ if (len > EXT4_NDIR_BLOCKS) {
+ ret = -EOPNOTSUPP;
+ goto errout;
+ }
+ }
+
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+ memset(ei->i_data, 0, sizeof(ei->i_data));
+	for (i = 0; i < len; i++)
+ ei->i_data[i] = cpu_to_le32(blk++);
+ ext4_mark_inode_dirty(handle, inode);
+errout:
+ ext4_journal_stop(handle);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ return ret;
+}
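Together with the ioctl.c change earlier in this series, clearing the extents flag from userspace now reaches this downgrade path. A hedged sketch of how that could look, using the standard linux/fs.h flag ioctls (FS_IOC_GETFLAGS, FS_IOC_SETFLAGS and FS_EXTENT_FL; the error handling is illustrative only):

	#include <fcntl.h>
	#include <linux/fs.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		long flags;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
			return 1;
		flags &= ~FS_EXTENT_FL;	/* request migration to indirect blocks */
		if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
			perror("FS_IOC_SETFLAGS");	/* e.g. EOPNOTSUPP on bigalloc */
		return close(fd);
	}
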
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index f9b551561d2c..214461e42a05 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -7,7 +7,7 @@
#include "ext4.h"
/* Checksumming functions */
-static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
+static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
int offset = offsetof(struct mmp_struct, mmp_checksum);
@@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(WRITE_SYNC, bh);
+ submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
@@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
get_bh(*bh);
lock_buffer(*bh);
(*bh)->b_end_io = end_buffer_read_sync;
- submit_bh(READ_SYNC, *bh);
+ submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
brelse(*bh);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 33e1c086858b..3dcbf364022f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
- * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
+ * ext4_double_down_write_data_sem - Acquire two inodes' write lock
+ * of i_data_sem
*
* Acquire write lock of i_data_sem of the two inodes
*/
-static void
-double_down_write_data_sem(struct inode *first, struct inode *second)
+void
+ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
if (first < second) {
down_write(&EXT4_I(first)->i_data_sem);
@@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second)
}
/**
- * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
+ * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure to be released its lock first
* @donor_inode: donor inode structure to be released its lock second
* Release write lock of i_data_sem of two inodes (orig and donor).
*/
-static void
-double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+void
+ext4_double_up_write_data_sem(struct inode *orig_inode,
+ struct inode *donor_inode)
{
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -407,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
end_ext, eh, range_to_move);
- if (depth) {
- ret = ext4_handle_dirty_metadata(handle, orig_inode,
- orig_path->p_bh);
- if (ret)
- return ret;
- } else {
- ret = ext4_mark_inode_dirty(handle, orig_inode);
- if (ret < 0)
- return ret;
- }
-
- return 0;
+ return ext4_ext_dirty(handle, orig_inode, orig_path);
}
/**
@@ -737,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
donor_off += dext_alen;
orig_off += dext_alen;
+ BUG_ON(replaced_count > count);
/* Already moved the expected blocks */
if (replaced_count >= count)
break;
@@ -814,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
page_cache_release(page[0]);
return -ENOMEM;
}
-
+ /*
+	 * grab_cache_page_write_begin() may not wait for the page's
+	 * writeback if the BDI does not demand it. It is reasonable to be
+	 * conservative here and wait for writeback explicitly.
+ */
+ wait_on_page_writeback(page[0]);
+ wait_on_page_writeback(page[1]);
if (inode1 > inode2) {
struct page *tmp;
tmp = page[0];
@@ -856,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
if (buffer_uptodate(bh))
continue;
if (!buffer_mapped(bh)) {
- int err = 0;
err = ext4_get_block(inode, block, bh, 0);
if (err) {
SetPageError(page);
@@ -976,7 +973,7 @@ again:
* necessary, just swap data blocks between orig and donor.
*/
if (uninit) {
- double_down_write_data_sem(orig_inode, donor_inode);
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of extents in range became initialized we have to
* fallback to data copying */
uninit = mext_check_coverage(orig_inode, orig_blk_offset,
@@ -990,7 +987,7 @@ again:
goto drop_data_sem;
if (!uninit) {
- double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto data_copy;
}
if ((page_has_private(pagep[0]) &&
@@ -1004,7 +1001,7 @@ again:
donor_inode, orig_blk_offset,
block_len_in_page, err);
drop_data_sem:
- double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto unlock_pages;
}
data_copy:
@@ -1033,7 +1030,7 @@ data_copy:
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
- *err = __block_write_begin(pagep[0], from, from + replaced_size,
+ *err = __block_write_begin(pagep[0], from, replaced_size,
ext4_get_block);
if (!*err)
*err = block_commit_write(pagep[0], from, from + replaced_size);
@@ -1065,11 +1062,11 @@ repair_branches:
* Extents are swapped already, but we are not able to copy data.
* Try to swap extents to it's original places
*/
- double_down_write_data_sem(orig_inode, donor_inode);
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
orig_blk_offset,
block_len_in_page, &err2);
- double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
if (replaced_count != block_len_in_page) {
EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
"Unable to copy data block,"
@@ -1209,15 +1206,15 @@ mext_check_arguments(struct inode *orig_inode,
}
/**
- * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
+ * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
*
* @inode1: the inode structure
* @inode2: the inode structure
*
* Lock two inodes' i_mutex
*/
-static void
-mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+void
+ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
BUG_ON(inode1 == inode2);
if (inode1 < inode2) {
@@ -1230,15 +1227,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
}
/**
- * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
*
* @inode1: the inode that is released first
* @inode2: the inode that is released second
*
*/
-static void
-mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+void
+ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
mutex_unlock(&inode1->i_mutex);
mutex_unlock(&inode2->i_mutex);
@@ -1333,7 +1330,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
return -EINVAL;
}
/* Protect orig and donor inodes against a truncate */
- mext_inode_double_lock(orig_inode, donor_inode);
+ ext4_inode_double_lock(orig_inode, donor_inode);
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(orig_inode);
@@ -1342,7 +1339,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
inode_dio_wait(donor_inode);
/* Protect extent tree against block allocations via delalloc */
- double_down_write_data_sem(orig_inode, donor_inode);
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
donor_start, &len);
@@ -1466,7 +1463,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
* b. racing with ->readpage, ->write_begin, and ext4_get_block
* in move_extent_per_page
*/
- double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
while (orig_page_offset <= seq_end_page) {
@@ -1500,7 +1497,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
block_len_in_page = rest_blocks;
}
- double_down_write_data_sem(orig_inode, donor_inode);
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
if (ret < 0)
break;
@@ -1538,10 +1535,10 @@ out:
ext4_ext_drop_refs(holecheck_path);
kfree(holecheck_path);
}
- double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
ext4_inode_resume_unlocked_dio(orig_inode);
ext4_inode_resume_unlocked_dio(donor_inode);
- mext_inode_double_unlock(orig_inode, donor_inode);
+ ext4_inode_double_unlock(orig_inode, donor_inode);
return ret;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3825d6aa8336..6653fc35ecb7 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- __u32 csum, old_csum;
+ __u32 csum;
+ __le32 save_csum;
int size;
size = count_offset + (count * sizeof(struct dx_entry));
- old_csum = t->dt_checksum;
+ save_csum = t->dt_checksum;
t->dt_checksum = 0;
csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
- t->dt_checksum = old_csum;
+ t->dt_checksum = save_csum;
return cpu_to_le32(csum);
}
@@ -971,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
hinfo.hash_version +=
EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ if (ext4_has_inline_data(dir)) {
+ int has_inline_data = 1;
+ count = htree_inlinedir_to_tree(dir_file, dir, 0,
+ &hinfo, start_hash,
+ start_minor_hash,
+ &has_inline_data);
+ if (has_inline_data) {
+ *next_hash = ~0;
+ return count;
+ }
+ }
count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
start_hash, start_minor_hash);
*next_hash = ~0;
@@ -1455,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
}
-#define S_SHIFT 12
-static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
-};
-
-static inline void ext4_set_de_type(struct super_block *sb,
- struct ext4_dir_entry_2 *de,
- umode_t mode) {
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
- de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
-}
-
/*
* Move count entries from end of map between two memory locations.
* Returns pointer to last entry moved.
@@ -2251,8 +2245,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
NULL, EXT4_HT_DIR, credits);
@@ -2286,8 +2279,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
NULL, EXT4_HT_DIR, credits);
@@ -2396,8 +2388,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
&dentry->d_name,
@@ -2826,8 +2817,7 @@ static int ext4_symlink(struct inode *dir,
* quota blocks, sb is already counted in previous macros).
*/
credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
}
retry:
inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 047a6de04a0a..5929cd0baa20 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -29,25 +29,19 @@
#include "xattr.h"
#include "acl.h"
-static struct kmem_cache *io_page_cachep, *io_end_cachep;
+static struct kmem_cache *io_end_cachep;
int __init ext4_init_pageio(void)
{
- io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
- if (io_page_cachep == NULL)
- return -ENOMEM;
io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
- if (io_end_cachep == NULL) {
- kmem_cache_destroy(io_page_cachep);
+ if (io_end_cachep == NULL)
return -ENOMEM;
- }
return 0;
}
void ext4_exit_pageio(void)
{
kmem_cache_destroy(io_end_cachep);
- kmem_cache_destroy(io_page_cachep);
}
/*
@@ -67,29 +61,28 @@ void ext4_ioend_shutdown(struct inode *inode)
cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
}
-static void put_io_page(struct ext4_io_page *io_page)
+static void ext4_release_io_end(ext4_io_end_t *io_end)
{
- if (atomic_dec_and_test(&io_page->p_count)) {
- end_page_writeback(io_page->p_page);
- put_page(io_page->p_page);
- kmem_cache_free(io_page_cachep, io_page);
- }
+ BUG_ON(!list_empty(&io_end->list));
+ BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+
+ if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
+ wake_up_all(ext4_ioend_wq(io_end->inode));
+ if (io_end->flag & EXT4_IO_END_DIRECT)
+ inode_dio_done(io_end->inode);
+ if (io_end->iocb)
+ aio_complete(io_end->iocb, io_end->result, 0);
+ kmem_cache_free(io_end_cachep, io_end);
}
-void ext4_free_io_end(ext4_io_end_t *io)
+static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
- int i;
-
- BUG_ON(!io);
- BUG_ON(!list_empty(&io->list));
- BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
+ struct inode *inode = io_end->inode;
- for (i = 0; i < io->num_io_pages; i++)
- put_io_page(io->pages[i]);
- io->num_io_pages = 0;
- if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
- wake_up_all(ext4_ioend_wq(io->inode));
- kmem_cache_free(io_end_cachep, io);
+ io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* Wake up anyone waiting on unwritten extent conversion */
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+ wake_up_all(ext4_ioend_wq(inode));
}
/* check a range of space and convert unwritten extents to written. */
@@ -112,13 +105,8 @@ static int ext4_end_io(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
- /* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
- wake_up_all(ext4_ioend_wq(inode));
- if (io->flag & EXT4_IO_END_DIRECT)
- inode_dio_done(inode);
- if (io->iocb)
- aio_complete(io->iocb, io->result, 0);
+ ext4_clear_io_unwritten_flag(io);
+ ext4_release_io_end(io);
return ret;
}
@@ -149,7 +137,7 @@ static void dump_completed_IO(struct inode *inode)
}
/* Add the io_end to per-inode completed end_io list. */
-void ext4_add_complete_io(ext4_io_end_t *io_end)
+static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
struct ext4_inode_info *ei = EXT4_I(io_end->inode);
struct workqueue_struct *wq;
@@ -186,8 +174,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
err = ext4_end_io(io);
if (unlikely(!ret && err))
ret = err;
- io->flag &= ~EXT4_IO_END_UNWRITTEN;
- ext4_free_io_end(io);
}
return ret;
}
@@ -219,10 +205,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
INIT_LIST_HEAD(&io->list);
+ atomic_set(&io->count, 1);
}
return io;
}
+void ext4_put_io_end_defer(ext4_io_end_t *io_end)
+{
+ if (atomic_dec_and_test(&io_end->count)) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ ext4_release_io_end(io_end);
+ return;
+ }
+ ext4_add_complete_io(io_end);
+ }
+}
+
+int ext4_put_io_end(ext4_io_end_t *io_end)
+{
+ int err = 0;
+
+ if (atomic_dec_and_test(&io_end->count)) {
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ err = ext4_convert_unwritten_extents(io_end->inode,
+ io_end->offset, io_end->size);
+ ext4_clear_io_unwritten_flag(io_end);
+ }
+ ext4_release_io_end(io_end);
+ }
+ return err;
+}
+
+ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
+{
+ atomic_inc(&io_end->count);
+ return io_end;
+}
+
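The three helpers above turn ext4_io_end_t into a reference-counted object. A condensed sketch of the intended lifetime, with illustrative placement rather than the literal ext4 call sites:

	/* Submitter holds the initial reference from ext4_init_io_end(). */
	io_end = ext4_init_io_end(inode, GFP_NOFS);	/* count == 1 */

	/* Each bio takes its own reference for its completion handler. */
	bio->bi_private = ext4_get_io_end(io_end);	/* count == 2 */

	/* ext4_end_bio() drops the bio's reference; the final put either
	 * frees the io_end or queues unwritten-extent conversion. */
	ext4_put_io_end_defer(io_end);

	/* The submitter drops its reference once all bios are issued. */
	ext4_put_io_end(io_end);
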
/*
 * Print a buffer I/O error compatible with fs/buffer.c. This
* provides compatibility with dmesg scrapers that look for a specific
@@ -243,45 +262,56 @@ static void ext4_end_bio(struct bio *bio, int error)
ext4_io_end_t *io_end = bio->bi_private;
struct inode *inode;
int i;
+ int blocksize;
sector_t bi_sector = bio->bi_sector;
BUG_ON(!io_end);
+ inode = io_end->inode;
+ blocksize = 1 << inode->i_blkbits;
bio->bi_private = NULL;
bio->bi_end_io = NULL;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;
- bio_put(bio);
-
- for (i = 0; i < io_end->num_io_pages; i++) {
- struct page *page = io_end->pages[i]->p_page;
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ struct bio_vec *bvec = &bio->bi_io_vec[i];
+ struct page *page = bvec->bv_page;
struct buffer_head *bh, *head;
- loff_t offset;
- loff_t io_end_offset;
+ unsigned bio_start = bvec->bv_offset;
+ unsigned bio_end = bio_start + bvec->bv_len;
+ unsigned under_io = 0;
+ unsigned long flags;
+
+ if (!page)
+ continue;
if (error) {
SetPageError(page);
set_bit(AS_EIO, &page->mapping->flags);
- head = page_buffers(page);
- BUG_ON(!head);
-
- io_end_offset = io_end->offset + io_end->size;
-
- offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
- bh = head;
- do {
- if ((offset >= io_end->offset) &&
- (offset+bh->b_size <= io_end_offset))
- buffer_io_error(bh);
-
- offset += bh->b_size;
- bh = bh->b_this_page;
- } while (bh != head);
}
-
- put_io_page(io_end->pages[i]);
+ bh = head = page_buffers(page);
+ /*
+	 * We check all buffers in the page under BH_Uptodate_Lock
+	 * to avoid races with other end_io handlers clearing the
+	 * async_write flag.
+ */
+ local_irq_save(flags);
+ bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ do {
+ if (bh_offset(bh) < bio_start ||
+ bh_offset(bh) + blocksize > bio_end) {
+ if (buffer_async_write(bh))
+ under_io++;
+ continue;
+ }
+ clear_buffer_async_write(bh);
+ if (error)
+ buffer_io_error(bh);
+ } while ((bh = bh->b_this_page) != head);
+ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+ local_irq_restore(flags);
+ if (!under_io)
+ end_page_writeback(page);
}
- io_end->num_io_pages = 0;
- inode = io_end->inode;
+ bio_put(bio);
if (error) {
io_end->flag |= EXT4_IO_END_ERROR;
@@ -294,12 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error)
bi_sector >> (inode->i_blkbits - 9));
}
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- ext4_free_io_end(io_end);
- return;
- }
-
- ext4_add_complete_io(io_end);
+ ext4_put_io_end_defer(io_end);
}
void ext4_io_submit(struct ext4_io_submit *io)
@@ -313,76 +338,59 @@ void ext4_io_submit(struct ext4_io_submit *io)
bio_put(io->io_bio);
}
io->io_bio = NULL;
- io->io_op = 0;
+}
+
+void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc)
+{
+ io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+ io->io_bio = NULL;
io->io_end = NULL;
}
-static int io_submit_init(struct ext4_io_submit *io,
- struct inode *inode,
- struct writeback_control *wbc,
- struct buffer_head *bh)
+static int io_submit_init_bio(struct ext4_io_submit *io,
+ struct buffer_head *bh)
{
- ext4_io_end_t *io_end;
- struct page *page = bh->b_page;
int nvecs = bio_get_nr_vecs(bh->b_bdev);
struct bio *bio;
- io_end = ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end)
- return -ENOMEM;
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
- bio->bi_private = io->io_end = io_end;
bio->bi_end_io = ext4_end_bio;
-
- io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
-
+ bio->bi_private = ext4_get_io_end(io->io_end);
+ if (!io->io_end->size)
+ io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
+ + bh_offset(bh);
io->io_bio = bio;
- io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
io->io_next_block = bh->b_blocknr;
return 0;
}
static int io_submit_add_bh(struct ext4_io_submit *io,
- struct ext4_io_page *io_page,
struct inode *inode,
- struct writeback_control *wbc,
struct buffer_head *bh)
{
ext4_io_end_t *io_end;
int ret;
- if (buffer_new(bh)) {
- clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
- }
-
if (io->io_bio && bh->b_blocknr != io->io_next_block) {
submit_and_retry:
ext4_io_submit(io);
}
if (io->io_bio == NULL) {
- ret = io_submit_init(io, inode, wbc, bh);
+ ret = io_submit_init_bio(io, bh);
if (ret)
return ret;
}
- io_end = io->io_end;
- if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
- (io_end->pages[io_end->num_io_pages-1] != io_page))
- goto submit_and_retry;
- if (buffer_uninit(bh))
- ext4_set_io_unwritten_flag(inode, io_end);
- io->io_end->size += bh->b_size;
- io->io_next_block++;
ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
- if ((io_end->num_io_pages == 0) ||
- (io_end->pages[io_end->num_io_pages-1] != io_page)) {
- io_end->pages[io_end->num_io_pages++] = io_page;
- atomic_inc(&io_page->p_count);
- }
+ io_end = io->io_end;
+ if (test_clear_buffer_uninit(bh))
+ ext4_set_io_unwritten_flag(inode, io_end);
+ io_end->size += bh->b_size;
+ io->io_next_block++;
return 0;
}
@@ -392,33 +400,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- unsigned block_start, block_end, blocksize;
- struct ext4_io_page *io_page;
+ unsigned block_start, blocksize;
struct buffer_head *bh, *head;
int ret = 0;
+ int nr_submitted = 0;
blocksize = 1 << inode->i_blkbits;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
- if (!io_page) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return -ENOMEM;
- }
- io_page->p_page = page;
- atomic_set(&io_page->p_count, 1);
- get_page(page);
set_page_writeback(page);
ClearPageError(page);
- for (bh = head = page_buffers(page), block_start = 0;
- bh != head || !block_start;
- block_start = block_end, bh = bh->b_this_page) {
-
- block_end = block_start + blocksize;
+ /*
+ * In the first loop we prepare and mark buffers to submit. We have to
+ * mark all buffers in the page before submitting so that
+	 * end_page_writeback() cannot be called from ext4_end_bio() when IO
+ * on the first buffer finishes and we are still working on submitting
+ * the second buffer.
+ */
+ bh = head = page_buffers(page);
+ do {
+ block_start = bh_offset(bh);
if (block_start >= len) {
/*
* Comments copied from block_write_full_page_endio:
@@ -431,7 +435,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* mapped, and writes to that region are not written
* out to the file."
*/
- zero_user_segment(page, block_start, block_end);
+ zero_user_segment(page, block_start,
+ block_start + blocksize);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
continue;
@@ -445,7 +450,19 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
ext4_io_submit(io);
continue;
}
- ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+ if (buffer_new(bh)) {
+ clear_buffer_new(bh);
+ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ }
+ set_buffer_async_write(bh);
+ } while ((bh = bh->b_this_page) != head);
+
+ /* Now submit buffers to write */
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_async_write(bh))
+ continue;
+ ret = io_submit_add_bh(io, inode, bh);
if (ret) {
/*
* We only get here on ENOMEM. Not much else
@@ -455,17 +472,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
redirty_page_for_writepage(wbc, page);
break;
}
+ nr_submitted++;
clear_buffer_dirty(bh);
+ } while ((bh = bh->b_this_page) != head);
+
+ /* Error stopped previous loop? Clean up buffers... */
+ if (ret) {
+ do {
+ clear_buffer_async_write(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
}
unlock_page(page);
- /*
- * If the page was truncated before we could do the writeback,
- * or we had a memory allocation error while trying to write
- * the first buffer head, we won't have submitted any pages for
- * I/O. In that case we need to make sure we've cleared the
- * PageWriteback bit from the page to prevent the system from
- * wedging later on.
- */
- put_io_page(io_page);
+ /* Nothing submitted - we have to end page writeback */
+ if (!nr_submitted)
+ end_page_writeback(page);
return ret;
}
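The hunk above is the heart of this change: all buffers are marked before any of them is submitted. A distilled sketch of that two-phase pattern follows; the buffer-head helpers are the generic ones from buffer_head.h, and submit_buffer_for_write() is a hypothetical stand-in for the bio construction that io_submit_add_bh() actually performs:

	static void write_page_buffers_two_phase(struct page *page)
	{
		struct buffer_head *bh, *head;

		/* Phase 1: mark every buffer that will be written before any
		 * I/O is issued, so an early completion still sees async_write
		 * set on the remaining buffers and keeps writeback pending. */
		bh = head = page_buffers(page);
		do {
			if (buffer_mapped(bh) && buffer_dirty(bh))
				set_buffer_async_write(bh);
		} while ((bh = bh->b_this_page) != head);

		/* Phase 2: submit; end_page_writeback() runs only after the
		 * last async_write buffer completes. */
		bh = head = page_buffers(page);
		do {
			if (buffer_async_write(bh)) {
				clear_buffer_dirty(bh);
				submit_buffer_for_write(bh);	/* hypothetical */
			}
		} while ((bh = bh->b_this_page) != head);
	}
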
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c169477a62c9..b27c96d01965 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -272,7 +272,7 @@ next_group:
if (start_blk >= last_blk)
goto next_group;
group_data[bb_index].block_bitmap = start_blk++;
- ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+ group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count--;
if (flexbg_size > 1)
@@ -284,7 +284,7 @@ next_group:
if (start_blk >= last_blk)
goto next_group;
group_data[ib_index].inode_bitmap = start_blk++;
- ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+ group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count--;
if (flexbg_size > 1)
@@ -296,7 +296,7 @@ next_group:
if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
goto next_group;
group_data[it_index].inode_table = start_blk;
- ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
+ group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count -=
EXT4_SB(sb)->s_itb_per_group;
@@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
ext4_group_t group;
int err;
- ext4_get_group_no_and_offset(sb, block, &group, NULL);
+ group = ext4_get_group_number(sb, block);
start = ext4_group_first_block_no(sb, group);
group -= flex_gd->groups[0].group;
@@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb,
/* Update the global fs size fields */
sbi->s_groups_count += flex_gd->count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
/* Update the reserved block counts only once the new group is
* active. */
@@ -1879,7 +1881,11 @@ retry:
/* Nothing need to do */
return 0;
- ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
+ n_group = ext4_get_group_number(sb, n_blocks_count - 1);
+ if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
+ ext4_warning(sb, "resize would cause inodes_count overflow");
+ return -EINVAL;
+ }
ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
n_desc_blocks = num_desc_blocks(sb, n_group + 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5d6d53578124..dbc7c090c13a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
+static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext2_fs_type = {
@@ -353,10 +354,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int error = is_journal_aborted(journal);
- struct ext4_journal_cb_entry *jce, *tmp;
+ struct ext4_journal_cb_entry *jce;
+ BUG_ON(txn->t_state == T_FINISHED);
spin_lock(&sbi->s_md_lock);
- list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
+ while (!list_empty(&txn->t_private_list)) {
+ jce = list_entry(txn->t_private_list.next,
+ struct ext4_journal_cb_entry, jce_list);
list_del_init(&jce->jce_list);
spin_unlock(&sbi->s_md_lock);
jce->jce_func(sb, jce, error);
@@ -1948,16 +1952,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
if ((sbi->s_es->s_feature_ro_compat &
cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
/* Use new metadata_csum algorithm */
- __u16 old_csum;
+ __le16 save_csum;
__u32 csum32;
- old_csum = gdp->bg_checksum;
+ save_csum = gdp->bg_checksum;
gdp->bg_checksum = 0;
csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
sizeof(le_group));
csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
sbi->s_desc_size);
- gdp->bg_checksum = old_csum;
+ gdp->bg_checksum = save_csum;
crc = csum32 & 0xFFFF;
goto out;
@@ -2379,17 +2383,15 @@ struct ext4_attr {
int offset;
};
-static int parse_strtoul(const char *buf,
- unsigned long max, unsigned long *value)
+static int parse_strtoull(const char *buf,
+ unsigned long long max, unsigned long long *value)
{
- char *endp;
-
- *value = simple_strtoul(skip_spaces(buf), &endp, 0);
- endp = skip_spaces(endp);
- if (*endp || *value > max)
- return -EINVAL;
+ int ret;
- return 0;
+ ret = kstrtoull(skip_spaces(buf), 0, value);
+ if (!ret && *value > max)
+ ret = -EINVAL;
+ return ret;
}
static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
@@ -2431,11 +2433,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
const char *buf, size_t count)
{
unsigned long t;
+ int ret;
- if (parse_strtoul(buf, 0x40000000, &t))
- return -EINVAL;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
- if (t && !is_power_of_2(t))
+ if (t && (!is_power_of_2(t) || t > 0x40000000))
return -EINVAL;
sbi->s_inode_readahead_blks = t;
@@ -2456,13 +2460,36 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
{
unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
unsigned long t;
+ int ret;
- if (parse_strtoul(buf, 0xffffffff, &t))
- return -EINVAL;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
*ui = t;
return count;
}
+static ssize_t reserved_clusters_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long long val;
+ int ret;
+
+ if (parse_strtoull(buf, -1ULL, &val))
+ return -EINVAL;
+ ret = ext4_reserve_clusters(sbi, val);
+
+ return ret ? ret : count;
+}
+
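The new attribute is a plain sysfs file, so it can be driven from a program as well as from the shell. A minimal sketch, assuming a filesystem mounted from sda1 (the directory is named after the device):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* Example path; substitute the actual device name. */
		int fd = open("/sys/fs/ext4/sda1/reserved_clusters", O_WRONLY);
		const char *val = "1024";	/* clusters to keep in reserve */

		if (fd < 0)
			return 1;
		if (write(fd, val, strlen(val)) < 0) {
			close(fd);
			return 1;
		}
		return close(fd);
	}
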
static ssize_t trigger_test_error(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
@@ -2500,6 +2527,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
EXT4_RO_ATTR(delayed_allocation_blocks);
EXT4_RO_ATTR(session_write_kbytes);
EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RW_ATTR(reserved_clusters);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
inode_readahead_blks_store, s_inode_readahead_blks);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2517,6 +2545,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(reserved_clusters),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
@@ -3192,6 +3221,40 @@ int ext4_calculate_overhead(struct super_block *sb)
return 0;
}
+
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
+{
+ ext4_fsblk_t resv_clusters;
+
+ /*
+ * By default we reserve 2% or 4096 clusters, whichever is smaller.
+	 * This should cover the situations where we cannot afford to run
+	 * out of space, such as punching a hole or converting uninitialized
+	 * extents in the delalloc path. Such allocations typically require
+	 * only one or two blocks; higher numbers are very rare.
+ */
+ resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+
+ do_div(resv_clusters, 50);
+ resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
+
+ return resv_clusters;
+}
+
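To make the numbers concrete: a 1 TiB filesystem with 4 KiB clusters has 2^28 clusters, of which 2% would be over five million, so the 4096-cluster cap applies; only filesystems smaller than roughly 800 MiB (4096 * 50 clusters of 4 KiB) are limited by the 2% term instead.
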
+
+static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
+{
+ ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
+ sbi->s_cluster_bits;
+
+ if (count >= clusters)
+ return -EINVAL;
+
+ atomic64_set(&sbi->s_resv_clusters, count);
+ return 0;
+}
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -3526,6 +3589,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
+ /* Do we have standard group size of blocksize * 8 blocks ? */
+ if (sbi->s_blocks_per_group == blocksize << 3)
+ set_opt2(sb, STD_GROUP_SIZE);
+
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3698,6 +3765,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_err_report.function = print_daily_error_info;
sbi->s_err_report.data = (unsigned long) sb;
+ /* Register extent status tree shrinker */
+ ext4_es_register_shrinker(sb);
+
err = percpu_counter_init(&sbi->s_freeclusters_counter,
ext4_count_free_clusters(sb));
if (!err) {
@@ -3723,9 +3793,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_max_writeback_mb_bump = 128;
sbi->s_extent_max_zeroout_kb = 32;
- /* Register extent status tree shrinker */
- ext4_es_register_shrinker(sb);
-
/*
* set up enough so that it can read an inode
*/
@@ -3911,6 +3978,13 @@ no_journal:
"available");
}
+ err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
+ "reserved pool", ext4_calculate_resv_clusters(sbi));
+ goto failed_mount4a;
+ }
+
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -4010,6 +4084,7 @@ failed_mount_wq:
sbi->s_journal = NULL;
}
failed_mount3:
+ ext4_es_unregister_shrinker(sb);
del_timer(&sbi->s_err_report);
if (sbi->s_flex_groups)
ext4_kvfree(sbi->s_flex_groups);
@@ -4177,7 +4252,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
journal->j_private = sb;
- ll_rw_block(READ, 1, &journal->j_sb_buffer);
+ ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
wait_on_buffer(journal->j_sb_buffer);
if (!buffer_uptodate(journal->j_sb_buffer)) {
ext4_msg(sb, KERN_ERR, "I/O error on journal device");
@@ -4742,9 +4817,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
struct super_block *sb = dentry->d_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
- ext4_fsblk_t overhead = 0;
+ ext4_fsblk_t overhead = 0, resv_blocks;
u64 fsid;
s64 bfree;
+ resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
if (!test_opt(sb, MINIX_DF))
overhead = sbi->s_overhead;
@@ -4756,8 +4832,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
/* prevent underflow in case that few free space is available */
buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
- buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
- if (buf->f_bfree < ext4_r_blocks_count(es))
+ buf->f_bavail = buf->f_bfree -
+ (ext4_r_blocks_count(es) + resv_blocks);
+ if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
@@ -4945,6 +5022,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
return PTR_ERR(qf_inode);
}
+ /* Don't account quota for quota files to avoid recursion */
+ qf_inode->i_flags |= S_NOQUOTA;
err = dquot_enable(qf_inode, type, format_id, flags);
iput(qf_inode);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a120b277240..c081e34f717f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
struct ext4_xattr_header *hdr)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- __u32 csum, old;
+ __u32 csum;
+ __le32 save_csum;
+ __le64 dsk_block_nr = cpu_to_le64(block_nr);
- old = hdr->h_checksum;
+ save_csum = hdr->h_checksum;
hdr->h_checksum = 0;
- block_nr = cpu_to_le64(block_nr);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr,
- sizeof(block_nr));
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
EXT4_BLOCK_SIZE(inode->i_sb));
- hdr->h_checksum = old;
+ hdr->h_checksum = save_csum;
return cpu_to_le32(csum);
}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index aa25deb5c6cd..c767dbdd7fc4 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -22,6 +22,7 @@
#define EXT4_XATTR_INDEX_LUSTRE 5
#define EXT4_XATTR_INDEX_SECURITY 6
#define EXT4_XATTR_INDEX_SYSTEM 7
+#define EXT4_XATTR_INDEX_RICHACL 8
struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 750c70148eff..0f53946f13c1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
int space_left = 0;
int first_tag = 0;
int tag_flag;
- int i, to_free = 0;
+ int i;
int tag_bytes = journal_tag_bytes(journal);
struct buffer_head *cbh = NULL; /* For transactional checksums */
__u32 crc32_sum = ~0;
@@ -1134,7 +1134,7 @@ restart_loop:
journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
spin_unlock(&journal->j_history_lock);
- commit_transaction->t_state = T_FINISHED;
+ commit_transaction->t_state = T_COMMIT_CALLBACK;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL;
@@ -1149,38 +1149,44 @@ restart_loop:
journal->j_average_commit_time*3) / 4;
else
journal->j_average_commit_time = commit_time;
+
write_unlock(&journal->j_state_lock);
- if (commit_transaction->t_checkpoint_list == NULL &&
- commit_transaction->t_checkpoint_io_list == NULL) {
- __jbd2_journal_drop_transaction(journal, commit_transaction);
- to_free = 1;
+ if (journal->j_checkpoint_transactions == NULL) {
+ journal->j_checkpoint_transactions = commit_transaction;
+ commit_transaction->t_cpnext = commit_transaction;
+ commit_transaction->t_cpprev = commit_transaction;
} else {
- if (journal->j_checkpoint_transactions == NULL) {
- journal->j_checkpoint_transactions = commit_transaction;
- commit_transaction->t_cpnext = commit_transaction;
- commit_transaction->t_cpprev = commit_transaction;
- } else {
- commit_transaction->t_cpnext =
- journal->j_checkpoint_transactions;
- commit_transaction->t_cpprev =
- commit_transaction->t_cpnext->t_cpprev;
- commit_transaction->t_cpnext->t_cpprev =
- commit_transaction;
- commit_transaction->t_cpprev->t_cpnext =
+ commit_transaction->t_cpnext =
+ journal->j_checkpoint_transactions;
+ commit_transaction->t_cpprev =
+ commit_transaction->t_cpnext->t_cpprev;
+ commit_transaction->t_cpnext->t_cpprev =
+ commit_transaction;
+ commit_transaction->t_cpprev->t_cpnext =
commit_transaction;
- }
}
spin_unlock(&journal->j_list_lock);
-
+	/* Drop all spin_locks because the commit callback may block.
+	 * __journal_remove_checkpoint() cannot destroy the transaction
+	 * under us because it is not marked as T_FINISHED yet. */
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
trace_jbd2_end_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
- if (to_free)
- jbd2_journal_free_transaction(commit_transaction);
+ write_lock(&journal->j_state_lock);
+ spin_lock(&journal->j_list_lock);
+ commit_transaction->t_state = T_FINISHED;
+ /* Recheck checkpoint lists after j_list_lock was dropped */
+ if (commit_transaction->t_checkpoint_list == NULL &&
+ commit_transaction->t_checkpoint_io_list == NULL) {
+ __jbd2_journal_drop_transaction(journal, commit_transaction);
+ jbd2_journal_free_transaction(commit_transaction);
+ }
+ spin_unlock(&journal->j_list_lock);
+ write_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_done_commit);
}
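
The restructuring above exists so that the transaction is parked in the new T_COMMIT_CALLBACK state, with all spinlocks dropped, while j_commit_callback runs; only afterwards is it marked T_FINISHED and, if both checkpoint lists are empty, dropped and freed. A consequence worth noting is that a client's commit callback is now allowed to block. A hypothetical client hook (name and body illustrative):

    /* May sleep: runs after the commit with no spinlocks held.  The
     * transaction cannot be freed under us until it goes T_FINISHED. */
    static void example_commit_callback(journal_t *journal,
                                        transaction_t *txn)
    {
        pr_debug("jbd2: committed tid %u\n", txn->t_tid);
    }
    ...
    journal->j_commit_callback = example_commit_callback;
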
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8b220f1ab54f..f6c5ba027f4f 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -708,6 +708,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
}
/*
+ * When this function returns, the transaction corresponding to tid
+ * will be completed. If the transaction is currently running, start
+ * committing that transaction before waiting for it to complete. If
+ * the transaction id is stale, it is by definition already completed,
+ * so just return SUCCESS.
+ */
+int jbd2_complete_transaction(journal_t *journal, tid_t tid)
+{
+ int need_to_wait = 1;
+
+ read_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction &&
+ journal->j_running_transaction->t_tid == tid) {
+ if (journal->j_commit_request != tid) {
+ /* transaction not yet started, so request it */
+ read_unlock(&journal->j_state_lock);
+ jbd2_log_start_commit(journal, tid);
+ goto wait_commit;
+ }
+ } else if (!(journal->j_committing_transaction &&
+ journal->j_committing_transaction->t_tid == tid))
+ need_to_wait = 0;
+ read_unlock(&journal->j_state_lock);
+ if (!need_to_wait)
+ return 0;
+wait_commit:
+ return jbd2_log_wait_commit(journal, tid);
+}
+EXPORT_SYMBOL(jbd2_complete_transaction);
+
+/*
* Log buffer allocation routines:
*/
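
For callers, the new helper collapses the usual start-then-wait dance into a single call. A sketch of the intent:

    /* Before: callers open-coded both steps. */
    jbd2_log_start_commit(journal, tid);
    err = jbd2_log_wait_commit(journal, tid);

    /* After: one call.  A stale tid returns 0 immediately, a running
     * tid gets a commit requested, a committing tid is waited on. */
    err = jbd2_complete_transaction(journal, tid);
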
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 325bc019ed88..10f524c59ea8 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks)
handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
if (!handle)
return NULL;
- memset(handle, 0, sizeof(*handle));
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;
@@ -640,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
int error;
char *frozen_buffer = NULL;
int need_copy = 0;
+ unsigned long start_lock, time_lock;
if (is_handle_aborted(handle))
return -EROFS;
@@ -655,9 +655,16 @@ repeat:
/* @@@ Need to check for errors here at some point. */
+ start_lock = jiffies;
lock_buffer(bh);
jbd_lock_bh_state(bh);
+ /* If it takes too long to lock the buffer, trace it */
+ time_lock = jbd2_time_diff(start_lock, jiffies);
+ if (time_lock > HZ/10)
+ trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
+ jiffies_to_msecs(time_lock));
+
/* We now hold the buffer lock so it is safe to query the buffer
* state. Is the buffer dirty?
*
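
The instrumentation added above follows a generic stall-measurement shape that can be reused around any potentially slow lock. The identifiers below are the ones from the hunk; note that HZ/10 jiffies is 100 ms regardless of the configured HZ:

    unsigned long start_lock = jiffies, time_lock;

    lock_buffer(bh);
    time_lock = jbd2_time_diff(start_lock, jiffies);
    if (time_lock > HZ/10)  /* stalled for more than 100 ms */
        trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
                                     jiffies_to_msecs(time_lock));
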
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 4c16c4a88d47..9e52b0626b39 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -34,6 +34,8 @@ enum bh_state_bits {
BH_Write_EIO, /* I/O error on write */
BH_Unwritten, /* Buffer is allocated on disk but not written */
BH_Quiet, /* Buffer Error Printks to be quiet */
+ BH_Meta, /* Buffer contains metadata */
+ BH_Prio, /* Buffer should be submitted with REQ_PRIO */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
@@ -124,6 +126,8 @@ BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary)
BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Unwritten, unwritten)
+BUFFER_FNS(Meta, meta)
+BUFFER_FNS(Prio, prio)
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
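
BUFFER_FNS(Meta, meta) and BUFFER_FNS(Prio, prio) generate set_buffer_meta()/buffer_meta() and set_buffer_prio()/buffer_prio() accessors, just like the neighbouring flag definitions. A plausible submission path then maps the flags onto request bits; the function below is illustrative only:

    static void example_submit_meta_bh(struct buffer_head *bh)
    {
        int rw = WRITE;

        set_buffer_meta(bh);   /* mark as filesystem metadata */
        set_buffer_prio(bh);   /* ask for REQ_PRIO treatment */

        if (buffer_meta(bh))
            rw |= REQ_META;
        if (buffer_prio(bh))
            rw |= REQ_PRIO;
        submit_bh(rw, bh);
    }
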
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 9978b614a1aa..dfac5ed31120 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -112,6 +112,8 @@ struct dma_buf_ops {
* @file: file pointer used for sharing buffers across, and for refcounting.
* @attachments: list of dma_buf_attachment that denotes all devices attached.
* @ops: dma_buf_ops associated with this buffer object.
+ * @exp_name: name of the exporter; useful for debugging.
+ * @list_node: node for dma_buf accounting and debugging.
* @priv: exporter specific private data for this buffer object.
*/
struct dma_buf {
@@ -123,6 +125,8 @@ struct dma_buf {
struct mutex lock;
unsigned vmapping_counter;
void *vmap_ptr;
+ const char *exp_name;
+ struct list_head list_node;
void *priv;
};
@@ -162,8 +166,13 @@ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf,
struct device *dev);
void dma_buf_detach(struct dma_buf *dmabuf,
struct dma_buf_attachment *dmabuf_attach);
-struct dma_buf *dma_buf_export(void *priv, const struct dma_buf_ops *ops,
- size_t size, int flags);
+
+struct dma_buf *dma_buf_export_named(void *priv, const struct dma_buf_ops *ops,
+ size_t size, int flags, const char *);
+
+#define dma_buf_export(priv, ops, size, flags) \
+ dma_buf_export_named(priv, ops, size, flags, __FILE__)
+
int dma_buf_fd(struct dma_buf *dmabuf, int flags);
struct dma_buf *dma_buf_get(int fd);
void dma_buf_put(struct dma_buf *dmabuf);
@@ -185,5 +194,6 @@ int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *,
unsigned long);
void *dma_buf_vmap(struct dma_buf *);
void dma_buf_vunmap(struct dma_buf *, void *vaddr);
-
+int dma_buf_debugfs_create_file(const char *name,
+ int (*write)(struct seq_file *));
#endif /* __DMA_BUF_H__ */
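
With the wrapper macro, existing dma_buf_export() callers compile unchanged and get their source file recorded as the exporter name; only exporters that want a nicer name need to call dma_buf_export_named() directly. Roughly (priv, my_dmabuf_ops, size and flags are placeholders):

    /* These two calls are equivalent under the macro above: */
    dmabuf = dma_buf_export(priv, &my_dmabuf_ops, size, flags);
    dmabuf = dma_buf_export_named(priv, &my_dmabuf_ops, size, flags,
                                  __FILE__);
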
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 50e5a5e6a712..6e051f472edb 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -480,6 +480,7 @@ struct transaction_s
T_COMMIT,
T_COMMIT_DFLUSH,
T_COMMIT_JFLUSH,
+ T_COMMIT_CALLBACK,
T_FINISHED
} t_state;
@@ -1144,7 +1145,7 @@ extern struct kmem_cache *jbd2_handle_cache;
static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags)
{
- return kmem_cache_alloc(jbd2_handle_cache, gfp_flags);
+ return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags);
}
static inline void jbd2_free_handle(handle_t *handle)
@@ -1200,6 +1201,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
int jbd2_journal_force_commit_nested(journal_t *journal);
int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
+int jbd2_complete_transaction(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal);
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
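
The jbd2_alloc_handle() change here pairs with the memset() dropped from new_handle() in the transaction.c hunk earlier: zeroing now happens at allocation time. The equivalence being relied on:

    kmem_cache_zalloc(cache, gfp);
    /* is shorthand for */
    kmem_cache_alloc(cache, gfp | __GFP_ZERO);
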
diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h
index c18b46f8aeeb..13a3da25ff07 100644
--- a/include/linux/journal-head.h
+++ b/include/linux/journal-head.h
@@ -31,21 +31,14 @@ struct journal_head {
/*
* Journalling list for this buffer [jbd_lock_bh_state()]
*/
- unsigned b_jlist;
+ unsigned b_jlist:4;
/*
* This flag signals the buffer has been modified by
* the currently running transaction
* [jbd_lock_bh_state()]
*/
- unsigned b_modified;
-
- /*
- * This feild tracks the last transaction id in which this buffer
- * has been cowed
- * [jbd_lock_bh_state()]
- */
- tid_t b_cow_tid;
+ unsigned b_modified:1;
/*
* Copy of the buffer data frozen for writing to the log.
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 4ee471003859..d0e686402df8 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -257,15 +257,7 @@ DECLARE_EVENT_CLASS(ext4__write_end,
__entry->pos, __entry->len, __entry->copied)
);
-DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end,
-
- TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
- unsigned int copied),
-
- TP_ARGS(inode, pos, len, copied)
-);
-
-DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end,
+DEFINE_EVENT(ext4__write_end, ext4_write_end,
TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
unsigned int copied),
@@ -1956,7 +1948,7 @@ TRACE_EVENT(ext4_remove_blocks,
__entry->to = to;
__entry->partial = partial_cluster;
__entry->ee_pblk = ext4_ext_pblock(ex);
- __entry->ee_lblk = cpu_to_le32(ex->ee_block);
+ __entry->ee_lblk = le32_to_cpu(ex->ee_block);
__entry->ee_len = ext4_ext_get_actual_len(ex);
),
@@ -2060,7 +2052,7 @@ TRACE_EVENT(ext4_ext_remove_space,
TRACE_EVENT(ext4_ext_remove_space_done,
TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth,
- ext4_lblk_t partial, unsigned short eh_entries),
+ ext4_lblk_t partial, __le16 eh_entries),
TP_ARGS(inode, start, depth, partial, eh_entries),
@@ -2079,7 +2071,7 @@ TRACE_EVENT(ext4_ext_remove_space_done,
__entry->start = start;
__entry->depth = depth;
__entry->partial = partial;
- __entry->eh_entries = eh_entries;
+ __entry->eh_entries = le16_to_cpu(eh_entries);
),
TP_printk("dev %d,%d ino %lu since %u depth %d partial %u "
diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
index 070df49e4a1d..c1d1f3eb242d 100644
--- a/include/trace/events/jbd2.h
+++ b/include/trace/events/jbd2.h
@@ -358,6 +358,27 @@ TRACE_EVENT(jbd2_write_superblock,
MINOR(__entry->dev), __entry->write_op)
);
+TRACE_EVENT(jbd2_lock_buffer_stall,
+
+ TP_PROTO(dev_t dev, unsigned long stall_ms),
+
+ TP_ARGS(dev, stall_ms),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field(unsigned long, stall_ms )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dev;
+ __entry->stall_ms = stall_ms;
+ ),
+
+ TP_printk("dev %d,%d stall_ms %lu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->stall_ms)
+);
+
#endif /* _TRACE_JBD2_H */
/* This part must be outside protection */
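
Given the TP_printk() format above, a 250 ms stall on device 8,1 would render roughly as below once the event is enabled through tracefs (values illustrative):

    jbd2_lock_buffer_stall: dev 8,1 stall_ms 250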