-rw-r--r--   arch/arm/include/asm/dma-mapping.h |  36
-rw-r--r--   arch/arm/include/asm/mach/map.h    |   2
-rw-r--r--   arch/arm/include/asm/memory.h      |   7
-rw-r--r--   arch/arm/mm/Kconfig                |  11
-rw-r--r--   arch/arm/mm/Makefile               |   8
-rw-r--r--   arch/arm/mm/dma-na-mapping.c       | 748
-rw-r--r--   arch/arm/mm/init.c                 |   3
-rw-r--r--   arch/arm/mm/mm.h                   |   5
-rw-r--r--   arch/arm/mm/mmu.c                  |  36
9 files changed, 855 insertions(+), 1 deletion(-)
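
For orientation before the per-file changes: the patch places a dedicated writecombine window directly below the existing DMA coherent window at the top of the consistent mapping space, and removes the backing RAM from the memblock memory map before it is ever mapped, so no cacheable kernel alias of these buffers exists. A minimal sketch of the resulting virtual layout, assuming the default CONSISTENT_DMA_SIZE and CONSISTENT_WC_SIZE of SZ_2M each (platforms may override both):

    CONSISTENT_END      = 0xffe00000
    CONSISTENT_BASE     = CONSISTENT_END - CONSISTENT_DMA_SIZE   = 0xffc00000
    CONSISTENT_WC_END   = CONSISTENT_BASE                        = 0xffc00000
    CONSISTENT_WC_BASE  = CONSISTENT_WC_END - CONSISTENT_WC_SIZE = 0xffa00000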
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 7a21d0bf7134..19e3c8c77f01 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -75,6 +75,27 @@ static inline dma_addr_t virt_to_dma(struct device *dev, void *addr)
  * Private support functions: these are not part of the API and are
  * liable to change.  Drivers must not use these.
  */
+#if defined(CONFIG_NON_ALIASED_COHERENT_MEM)
+static inline void __dma_single_cpu_to_dev(struct device *dev,
+	const void *kaddr, size_t size, enum dma_data_direction dir)
+{
+	extern void ___dma_single_cpu_to_dev(struct device *dev,
+		const void *, size_t, enum dma_data_direction);
+
+	if (!arch_is_coherent())
+		___dma_single_cpu_to_dev(dev, kaddr, size, dir);
+}
+
+static inline void __dma_single_dev_to_cpu(struct device *dev,
+	const void *kaddr, size_t size, enum dma_data_direction dir)
+{
+	extern void ___dma_single_dev_to_cpu(struct device *dev,
+		const void *, size_t, enum dma_data_direction);
+
+	if (!arch_is_coherent())
+		___dma_single_dev_to_cpu(dev, kaddr, size, dir);
+}
+#else
 static inline void __dma_single_cpu_to_dev(const void *kaddr, size_t size,
 	enum dma_data_direction dir)
 {
@@ -94,6 +115,7 @@ static inline void __dma_single_dev_to_cpu(const void *kaddr, size_t size,
 	if (!arch_is_coherent())
 		___dma_single_dev_to_cpu(kaddr, size, dir);
 }
+#endif
 
 static inline void __dma_page_cpu_to_dev(struct page *page, unsigned long off,
 	size_t size, enum dma_data_direction dir)
@@ -199,8 +221,12 @@ int dma_mmap_coherent(struct device *, struct vm_area_struct *,
 extern void *dma_alloc_writecombine(struct device *, size_t, dma_addr_t *,
 		gfp_t);
 
+#if defined(CONFIG_NON_ALIASED_COHERENT_MEM)
+extern void dma_free_writecombine(struct device *, size_t, void *, dma_addr_t);
+#else
 #define dma_free_writecombine(dev,size,cpu_addr,handle) \
 	dma_free_coherent(dev,size,cpu_addr,handle)
+#endif
 
 int dma_mmap_writecombine(struct device *, struct vm_area_struct *,
 		void *, dma_addr_t, size_t);
@@ -421,7 +447,12 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev,
 	if (!dmabounce_sync_for_cpu(dev, handle, offset, size, dir))
 		return;
 
+#if defined(CONFIG_NON_ALIASED_COHERENT_MEM)
+	__dma_single_dev_to_cpu(dev, dma_to_virt(dev, handle) + offset,
+		size, dir);
+#else
 	__dma_single_dev_to_cpu(dma_to_virt(dev, handle) + offset, size, dir);
+#endif
 }
 
 static inline void dma_sync_single_range_for_device(struct device *dev,
@@ -435,7 +466,12 @@ static inline void dma_sync_single_range_for_device(struct device *dev,
 	if (!dmabounce_sync_for_device(dev, handle, offset, size, dir))
 		return;
 
+#if defined(CONFIG_NON_ALIASED_COHERENT_MEM)
+	__dma_single_cpu_to_dev(dev, dma_to_virt(dev, handle) + offset,
+		size, dir);
+#else
 	__dma_single_cpu_to_dev(dma_to_virt(dev, handle) + offset, size, dir);
+#endif
 }
 
 static inline void dma_sync_single_for_cpu(struct device *dev,
diff --git a/arch/arm/include/asm/mach/map.h b/arch/arm/include/asm/mach/map.h
index d2fedb5aeb1f..3845215404dd 100644
--- a/arch/arm/include/asm/mach/map.h
+++ b/arch/arm/include/asm/mach/map.h
@@ -29,6 +29,8 @@ struct map_desc {
 #define MT_MEMORY_NONCACHED	11
 #define MT_MEMORY_DTCM		12
 #define MT_MEMORY_ITCM		13
+#define MT_DMA_COHERENT		14
+#define MT_WC_COHERENT		15
 
 #ifdef CONFIG_MMU
 extern void iotable_init(struct map_desc *, int);
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index 25669795d80d..cf7c02042f60 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -88,6 +88,13 @@
 #define CONSISTENT_END		(0xffe00000UL)
 #define CONSISTENT_BASE		(CONSISTENT_END - CONSISTENT_DMA_SIZE)
 
+#ifndef CONSISTENT_WC_SIZE
+#define CONSISTENT_WC_SIZE	SZ_2M
+#endif
+
+#define CONSISTENT_WC_END	CONSISTENT_BASE
+#define CONSISTENT_WC_BASE	(CONSISTENT_WC_END - CONSISTENT_WC_SIZE)
+
 #else	/* CONFIG_MMU */
 
 /*
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index aaea6d487bae..e23ad5ebc330 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -885,6 +885,17 @@ config ARM_DMA_MEM_BUFFERABLE
 
 	  You are recommended say 'Y' here and debug any affected drivers.
 
+config NON_ALIASED_COHERENT_MEM
+	bool "Avoid aliasing mappings in DMA coherent allocator" if CPU_V7
+	help
+	  Avoid multiple mappings with the DMA coherent/writecombine
+	  allocator by pre-allocating the mappings, and removing that
+	  memory from the system memory mapping.
+
+	  This makes DMA memory fully coherent, so the dma_sync cache
+	  maintenance APIs are not necessary to maintain cache coherency
+	  for the DMA coherent region.
+
 config ARCH_HAS_BARRIERS
 	bool
 	help
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 47e2e3ba1902..c3ef2961ddff 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -2,9 +2,15 @@
 # Makefile for the linux arm-specific parts of the memory manager.
 #
 
-obj-y				:= dma-mapping.o extable.o fault.o init.o \
+obj-y				:= extable.o fault.o init.o \
 				   iomap.o
 
+ifeq ($(CONFIG_NON_ALIASED_COHERENT_MEM),y)
+obj-y				+= dma-na-mapping.o
+else
+obj-y				+= dma-mapping.o
+endif
+
 obj-$(CONFIG_MMU)		+= fault-armv.o flush.o idmap.o ioremap.o \
 				   mmap.o pgd.o mmu.o vmregion.o pageattr.o
 
diff --git a/arch/arm/mm/dma-na-mapping.c b/arch/arm/mm/dma-na-mapping.c
new file mode 100644
index 000000000000..4e5d52dda285
--- /dev/null
+++ b/arch/arm/mm/dma-na-mapping.c
@@ -0,0 +1,748 @@
+/*
+ *  linux/arch/arm/mm/dma-na-mapping.c
+ *
+ *  Copyright (C) 2000-2004 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  DMA uncached mapping support.
+ */
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/highmem.h>
+#include <linux/memblock.h>
+
+#include <asm/memory.h>
+#include <asm/highmem.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/sizes.h>
+#include <asm/mach/map.h>
+
+#include "mm.h"
+
+static u64 get_coherent_dma_mask(struct device *dev)
+{
+	u64 mask = (u64)arm_dma_limit;
+
+	if (dev) {
+		mask = dev->coherent_dma_mask;
+
+		/*
+		 * Sanity check the DMA mask - it must be non-zero, and
+		 * must be able to be satisfied by a DMA allocation.
+		 */
+		if (mask == 0) {
+			dev_warn(dev, "coherent DMA mask is unset\n");
+			return 0;
+		}
+
+		if ((~mask) & (u64)arm_dma_limit) {
+			dev_warn(dev, "coherent DMA mask %#llx is smaller "
+				 "than system GFP_DMA mask %#llx\n",
+				 mask, (u64)arm_dma_limit);
+			return 0;
+		}
+	}
+
+	return mask;
+}
+
+/*
+ * Allocate a DMA buffer for 'dev' of size 'size' using the
+ * specified gfp mask.  Note that 'size' must be page aligned.
+ */
+static struct page *__dma_alloc_buffer(struct device *dev,
+				       size_t size, gfp_t gfp)
+{
+	unsigned long order = get_order(size);
+	struct page *page, *p, *e;
+	void *ptr;
+	u64 mask = get_coherent_dma_mask(dev);
+
+#ifdef CONFIG_DMA_API_DEBUG
+	u64 limit = (mask + 1) & ~mask;
+	if (limit && size >= limit) {
+		dev_warn(dev, "coherent allocation too big "
+			 "(requested %#x mask %#llx)\n",
+			 size, mask);
+		return NULL;
+	}
+#endif
+
+	if (!mask)
+		return NULL;
+
+	if (mask < 0xffffffffULL)
+		gfp |= GFP_DMA;
+
+	page = alloc_pages(gfp, order);
+	if (!page)
+		return NULL;
+
+	/*
+	 * Now split the huge page and free the excess pages
+	 */
+	split_page(page, order);
+	for (p = page + (size >> PAGE_SHIFT), e = page + (1 << order);
+	     p < e; p++)
+		__free_page(p);
+
+	/*
+	 * Ensure that the allocated pages are zeroed, and that any data
+	 * lurking in the kernel direct-mapped region is invalidated.
+	 */
+	ptr = page_address(page);
+	memset(ptr, 0, size);
+	dmac_flush_range(ptr, ptr + size);
+	outer_flush_range(__pa(ptr), __pa(ptr) + size);
+
+	return page;
+}
+
+/*
+ * Free a DMA buffer.  'size' must be page aligned.
+ */
+static void __dma_free_buffer(struct page *page, size_t size)
+{
+	struct page *e = page + (size >> PAGE_SHIFT);
+
+	while (page < e) {
+		__free_page(page);
+		page++;
+	}
+}
+
+#ifdef CONFIG_MMU
+/* Sanity check sizes */
+#if CONSISTENT_DMA_SIZE % SECTION_SIZE
+#error "CONSISTENT_DMA_SIZE must be a multiple of the section size"
+#endif
+#if CONSISTENT_WC_SIZE % SECTION_SIZE
+#error "CONSISTENT_WC_SIZE must be a multiple of the section size"
+#endif
+#if ((CONSISTENT_DMA_SIZE + CONSISTENT_WC_SIZE) % SZ_2M)
+#error "Sum of CONSISTENT_DMA_SIZE and CONSISTENT_WC_SIZE must " \
+	"be multiple of 2MiB"
+#endif
+
+#include "vmregion.h"
+
+struct dma_coherent_area {
+	struct arm_vmregion_head vm;
+	unsigned long pfn;
+	unsigned long pfn_end;
+	unsigned int type;
+	const char *name;
+};
+
+static struct dma_coherent_area coherent_wc_head = {
+	.vm = {
+		.vm_start	= CONSISTENT_WC_BASE,
+		.vm_end		= CONSISTENT_WC_END,
+	},
+	.pfn = 0,
+	.pfn_end = 0,
+	.type = MT_WC_COHERENT,
+	.name = "WC ",
+};
+
+static struct dma_coherent_area coherent_dma_head = {
+	.vm = {
+		.vm_start	= CONSISTENT_BASE,
+		.vm_end		= CONSISTENT_END,
+	},
+	.pfn = 0,
+	.pfn_end = 0,
+	.type = MT_DMA_COHERENT,
+	.name = "DMA coherent ",
+};
+
+static struct dma_coherent_area *coherent_areas[2] __initdata = {
+	&coherent_wc_head,
+	&coherent_dma_head
+};
+
+static struct dma_coherent_area *coherent_map[2];
+#define coherent_wc_area coherent_map[0]
+#define coherent_dma_area coherent_map[1]
+
+void dma_coherent_reserve(void)
+{
+	phys_addr_t base;
+	unsigned long size;
+	int can_share, i;
+
+	if (arch_is_coherent())
+		return;
+
+#ifdef CONFIG_ARM_DMA_MEM_BUFFERABLE
+	/* ARMv6: only when DMA_MEM_BUFFERABLE is enabled */
+	can_share = cpu_architecture() >= CPU_ARCH_ARMv6;
+#else
+	/* ARMv7+: WC and DMA areas have the same properties, so can share */
+	can_share = cpu_architecture() >= CPU_ARCH_ARMv7;
+#endif
+	if (can_share) {
+		coherent_wc_head.name = "DMA coherent/WC ";
+		coherent_wc_head.vm.vm_end = coherent_dma_head.vm.vm_end;
+		coherent_dma_head.vm.vm_start = coherent_dma_head.vm.vm_end;
+		coherent_dma_area = coherent_wc_area = &coherent_wc_head;
+	} else {
+		memcpy(coherent_map, coherent_areas, sizeof(coherent_map));
+	}
+
+	for (i = 0; i < ARRAY_SIZE(coherent_areas); i++) {
+		struct dma_coherent_area *area = coherent_areas[i];
+
+		size = area->vm.vm_end - area->vm.vm_start;
+		if (!size)
+			continue;
+
+		spin_lock_init(&area->vm.vm_lock);
+		INIT_LIST_HEAD(&area->vm.vm_list);
+
+		base = memblock_end_of_DRAM() - size;
+		memblock_free(base, size);
+		memblock_remove(base, size);
+
+		area->pfn = __phys_to_pfn(base);
+		area->pfn_end = __phys_to_pfn(base + size);
+
+		pr_info("DMA: %luMiB %smemory allocated at 0x%08llx phys\n",
+			size / 1048576, area->name, (unsigned long long)base);
+	}
+}
+
+void __init dma_coherent_mapping(void)
+{
+	struct map_desc map[ARRAY_SIZE(coherent_areas)];
+	int nr;
+
+	for (nr = 0; nr < ARRAY_SIZE(map); nr++) {
+		struct dma_coherent_area *area = coherent_areas[nr];
+
+		map[nr].pfn = area->pfn;
+		map[nr].virtual = area->vm.vm_start;
+		map[nr].length = area->vm.vm_end - area->vm.vm_start;
+		map[nr].type = area->type;
+		if (map[nr].length == 0)
+			break;
+	}
+
+	iotable_init(map, nr);
+}
+
+static void *dma_alloc_area(size_t size, unsigned long *pfn, gfp_t gfp,
+			    struct dma_coherent_area *area)
+{
+	struct arm_vmregion *c;
+	size_t align;
+	int bit;
+
+	/*
+	 * Align the virtual region allocation - maximum alignment is
+	 * a section size, minimum is a page size.  This helps reduce
+	 * fragmentation of the DMA space, and also prevents allocations
+	 * smaller than a section from crossing a section boundary.
+	 */
+	bit = fls(size - 1);
+	if (bit > SECTION_SHIFT)
+		bit = SECTION_SHIFT;
+	align = 1 << bit;
+
+	/*
+	 * Allocate a virtual address in the consistent mapping region.
+	 */
+	c = arm_vmregion_alloc(&area->vm, align, size,
+			       gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
+	if (!c)
+		return NULL;
+
+	memset((void *)c->vm_start, 0, size);
+	*pfn = area->pfn + ((c->vm_start - area->vm.vm_start) >> PAGE_SHIFT);
+	c->vm_pages = pfn_to_page(*pfn);
+
+	return (void *)c->vm_start;
+}
+
+static void dma_free_area(void *cpu_addr, size_t size,
+			  struct dma_coherent_area *area)
+{
+	struct arm_vmregion *c;
+
+	c = arm_vmregion_find_remove(&area->vm, (unsigned long)cpu_addr);
+	if (!c) {
+		printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
+		       __func__, cpu_addr);
+		dump_stack();
+		return;
+	}
+
+	if ((c->vm_end - c->vm_start) != size) {
+		printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
+		       __func__, c->vm_end - c->vm_start, size);
+		dump_stack();
+		size = c->vm_end - c->vm_start;
+	}
+
+	arm_vmregion_free(&area->vm, c);
+}
+
+#define nommu()	(0)
+
+#else	/* !CONFIG_MMU */
+
+#define dma_alloc_area(size, pfn, gfp, area)	({ *(pfn) = 0; NULL; })
+#define dma_free_area(addr, size, area)		do { } while (0)
+
+#define nommu()	(1)
+#define coherent_wc_area NULL
+#define coherent_dma_area NULL
+
+void dma_coherent_reserve(void)
+{
+}
+
+#endif	/* CONFIG_MMU */
+
+static void *
+__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
+	    struct dma_coherent_area *area)
+{
+
+	unsigned long pfn;
+	void *ret;
+	/* Following is a work-around (a.k.a. hack) to prevent pages
+	 * with __GFP_COMP being passed to split_page() which cannot
+	 * handle them.  The real problem is that this flag probably
+	 * should be 0 on ARM as it is not supported on this
+	 * platform--see CONFIG_HUGETLB_PAGE.
+	 */
+	gfp &= ~(__GFP_COMP);
+
+	*handle = ~0;
+	size = PAGE_ALIGN(size);
+
+	if (arch_is_coherent() || nommu()) {
+		struct page *page = __dma_alloc_buffer(dev, size, gfp);
+		if (!page)
+			return NULL;
+		pfn = page_to_pfn(page);
+		ret = page_address(page);
+	} else {
+		ret = dma_alloc_area(size, &pfn, gfp, area);
+	}
+
+	if (ret)
+		*handle = pfn_to_dma(dev, pfn);
+
+	return ret;
+}
+
+static void __dma_free(struct device *dev, size_t size, void *cpu_addr,
+		       dma_addr_t handle, struct dma_coherent_area *area)
+{
+	size = PAGE_ALIGN(size);
+
+	if (arch_is_coherent() || nommu())
+		__dma_free_buffer(pfn_to_page(dma_to_pfn(dev, handle)), size);
+	else
+		dma_free_area(cpu_addr, size, area);
+}
+
+/*
+ * Allocate DMA-coherent memory space and return both the kernel remapped
+ * virtual and bus address for that space.
+ */
+void *
+dma_alloc_coherent(struct device *dev, size_t size,
+		   dma_addr_t *handle, gfp_t gfp)
+{
+	void *memory;
+
+	if (dma_alloc_from_coherent(dev, size, handle, &memory))
+		return memory;
+
+	return __dma_alloc(dev, size, handle, gfp, coherent_dma_area);
+}
+EXPORT_SYMBOL(dma_alloc_coherent);
+
+/*
+ * Allocate a writecombining region, in much the same way as
+ * dma_alloc_coherent above.
+ */
+void *
+dma_alloc_writecombine(struct device *dev, size_t size,
+		       dma_addr_t *handle, gfp_t gfp)
+{
+	return __dma_alloc(dev, size, handle, gfp, coherent_wc_area);
+}
+EXPORT_SYMBOL(dma_alloc_writecombine);
+
+static int dma_mmap(struct device *dev, struct vm_area_struct *vma,
+		    void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		    struct dma_coherent_area *area)
+{
+	int ret = -ENXIO;
+#ifdef CONFIG_MMU
+	unsigned long user_size, kern_size;
+	struct arm_vmregion *c;
+
+	user_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+
+	c = arm_vmregion_find(&area->vm, (unsigned long)cpu_addr);
+	if (c) {
+		unsigned long off = vma->vm_pgoff;
+
+		kern_size = (c->vm_end - c->vm_start) >> PAGE_SHIFT;
+
+		if (off < kern_size &&
+		    user_size <= (kern_size - off)) {
+			ret = remap_pfn_range(vma, vma->vm_start,
+					      page_to_pfn(c->vm_pages) + off,
+					      user_size << PAGE_SHIFT,
+					      vma->vm_page_prot);
+		}
+	}
+#endif	/* CONFIG_MMU */
+
+	return ret;
+}
+
+int dma_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
+		      void *cpu_addr, dma_addr_t dma_addr, size_t size)
+{
+	vma->vm_page_prot = pgprot_dmacoherent(vma->vm_page_prot);
+	return dma_mmap(dev, vma, cpu_addr, dma_addr, size, coherent_dma_area);
+}
+EXPORT_SYMBOL(dma_mmap_coherent);
+
+int dma_mmap_writecombine(struct device *dev, struct vm_area_struct *vma,
+			  void *cpu_addr, dma_addr_t dma_addr, size_t size)
+{
+	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+	return dma_mmap(dev, vma, cpu_addr, dma_addr, size, coherent_wc_area);
+}
+EXPORT_SYMBOL(dma_mmap_writecombine);
+
+/*
+ * free a page as defined by the above mapping.
+ * Must not be called with IRQs disabled.
+ */
+void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
+		       dma_addr_t handle)
+{
+	WARN_ON(irqs_disabled());
+
+	if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
+		return;
+
+	__dma_free(dev, size, cpu_addr, handle, coherent_dma_area);
+}
+EXPORT_SYMBOL(dma_free_coherent);
+
+void dma_free_writecombine(struct device *dev, size_t size, void *cpu_addr,
+			   dma_addr_t handle)
+{
+	WARN_ON(irqs_disabled());
+
+	__dma_free(dev, size, cpu_addr, handle, coherent_wc_area);
+}
+EXPORT_SYMBOL(dma_free_writecombine);
+
+#define is_coherent_pfn(x) ((((x) >= coherent_wc_head.pfn) && \
+			     ((x) < coherent_wc_head.pfn_end)) || \
+			    (((x) >= coherent_dma_head.pfn) && \
+			     ((x) < coherent_dma_head.pfn_end)))
+/*
+ * Make an area consistent for devices.
+ * Note: Drivers should NOT use this function directly, as it will break
+ * platforms with CONFIG_DMABOUNCE.
+ * Use the driver DMA support - see dma-mapping.h (dma_sync_*)
+ */
+void ___dma_single_cpu_to_dev(struct device *dev, const void *kaddr,
+			      size_t size, enum dma_data_direction dir)
+{
+	unsigned long paddr;
+	unsigned long kaddr_pfn;
+
+	kaddr_pfn = dma_to_pfn(dev, virt_to_dma(dev, (void *)kaddr));
+
+	if (is_coherent_pfn(kaddr_pfn))
+		return;
+
+	BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));
+
+	dmac_map_area(kaddr, size, dir);
+
+	paddr = __pa(kaddr);
+	if (dir == DMA_FROM_DEVICE)
+		outer_inv_range(paddr, paddr + size);
+	else
+		outer_clean_range(paddr, paddr + size);
+
+	/* FIXME: non-speculating: flush on bidirectional mappings? */
+}
+EXPORT_SYMBOL(___dma_single_cpu_to_dev);
+
+void ___dma_single_dev_to_cpu(struct device *dev, const void *kaddr,
+			      size_t size, enum dma_data_direction dir)
+{
+	unsigned long kaddr_pfn;
+
+	kaddr_pfn = dma_to_pfn(dev, virt_to_dma(dev, (void *)kaddr));
+
+	if (is_coherent_pfn(kaddr_pfn))
+		return;
+
+	BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));
+
+	/* FIXME: non-speculating: not required */
+	/* don't bother invalidating if DMA to device */
+	if (dir != DMA_TO_DEVICE) {
+		unsigned long paddr = __pa(kaddr);
+		outer_inv_range(paddr, paddr + size);
+	}
+
+	dmac_unmap_area(kaddr, size, dir);
+}
+EXPORT_SYMBOL(___dma_single_dev_to_cpu);
+
+static void dma_cache_maint_page(struct page *page, unsigned long offset,
+	size_t size, enum dma_data_direction dir,
+	void (*op)(const void *, size_t, int))
+{
+	/*
+	 * A single sg entry may refer to multiple physically contiguous
+	 * pages.  But we still need to process highmem pages individually.
+	 * If highmem is not configured then the bulk of this loop gets
+	 * optimized out.
+	 */
+	size_t left = size;
+	do {
+		size_t len = left;
+		void *vaddr;
+
+		if (PageHighMem(page)) {
+			if (len + offset > PAGE_SIZE) {
+				if (offset >= PAGE_SIZE) {
+					page += offset / PAGE_SIZE;
+					offset %= PAGE_SIZE;
+				}
+				len = PAGE_SIZE - offset;
+			}
+			vaddr = kmap_high_get(page);
+			if (vaddr) {
+				vaddr += offset;
+				op(vaddr, len, dir);
+				kunmap_high(page);
+			} else if (cache_is_vipt()) {
+				/* unmapped pages might still be cached */
+				vaddr = kmap_atomic(page);
+				op(vaddr + offset, len, dir);
+				kunmap_atomic(vaddr);
+			}
+		} else {
+			vaddr = page_address(page) + offset;
+			op(vaddr, len, dir);
+		}
+		offset = 0;
+		page++;
+		left -= len;
+	} while (left);
+}
+
+void ___dma_page_cpu_to_dev(struct page *page, unsigned long off,
+	size_t size, enum dma_data_direction dir)
+{
+	unsigned long paddr;
+
+	dma_cache_maint_page(page, off, size, dir, dmac_map_area);
+
+	paddr = page_to_phys(page) + off;
+	if (dir == DMA_FROM_DEVICE)
+		outer_inv_range(paddr, paddr + size);
+	else
+		outer_clean_range(paddr, paddr + size);
+	/* FIXME: non-speculating: flush on bidirectional mappings? */
+}
+EXPORT_SYMBOL(___dma_page_cpu_to_dev);
+
+void ___dma_page_dev_to_cpu(struct page *page, unsigned long off,
+	size_t size, enum dma_data_direction dir)
+{
+	unsigned long paddr = page_to_phys(page) + off;
+
+	/* FIXME: non-speculating: not required */
+	/* don't bother invalidating if DMA to device */
+	if (dir != DMA_TO_DEVICE)
+		outer_inv_range(paddr, paddr + size);
+
+	dma_cache_maint_page(page, off, size, dir, dmac_unmap_area);
+
+	/*
+	 * Mark the D-cache clean for this page to avoid extra flushing.
+	 */
+	if (dir != DMA_TO_DEVICE && off == 0 && size >= PAGE_SIZE)
+		set_bit(PG_dcache_clean, &page->flags);
+}
+EXPORT_SYMBOL(___dma_page_dev_to_cpu);
+
+/**
+ * dma_map_sg - map a set of SG buffers for streaming mode DMA
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @sg: list of buffers
+ * @nents: number of buffers to map
+ * @dir: DMA transfer direction
+ *
+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
+ * This is the scatter-gather version of the dma_map_single interface.
+ * Here the scatter gather list elements are each tagged with the
+ * appropriate dma address and length.  They are obtained via
+ * sg_dma_{address,length}.
+ *
+ * Device ownership issues as mentioned for dma_map_single are the same
+ * here.
+ */
+int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+	       enum dma_data_direction dir)
+{
+	struct scatterlist *s;
+	int i, j;
+
+	BUG_ON(!valid_dma_direction(dir));
+
+	for_each_sg(sg, s, nents, i) {
+		s->dma_address = __dma_map_page(dev, sg_page(s), s->offset,
+						s->length, dir);
+		if (dma_mapping_error(dev, s->dma_address))
+			goto bad_mapping;
+	}
+	debug_dma_map_sg(dev, sg, nents, nents, dir);
+	return nents;
+
+ bad_mapping:
+	for_each_sg(sg, s, i, j)
+		__dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
+	return 0;
+}
+EXPORT_SYMBOL(dma_map_sg);
+
+/**
+ * dma_unmap_sg - unmap a set of SG buffers mapped by dma_map_sg
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @sg: list of buffers
+ * @nents: number of buffers to unmap (same as was passed to dma_map_sg)
+ * @dir: DMA transfer direction (same as was passed to dma_map_sg)
+ *
+ * Unmap a set of streaming mode DMA translations.  Again, CPU access
+ * rules concerning calls here are the same as for dma_unmap_single().
+ */
+void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+		  enum dma_data_direction dir)
+{
+	struct scatterlist *s;
+	int i;
+
+	debug_dma_unmap_sg(dev, sg, nents, dir);
+
+	for_each_sg(sg, s, nents, i)
+		__dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
+}
+EXPORT_SYMBOL(dma_unmap_sg);
+
+/**
+ * dma_sync_sg_for_cpu
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @sg: list of buffers
+ * @nents: number of buffers to map (returned from dma_map_sg)
+ * @dir: DMA transfer direction (same as was passed to dma_map_sg)
+ */
+void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+			 int nents, enum dma_data_direction dir)
+{
+	struct scatterlist *s;
+	int i;
+
+	for_each_sg(sg, s, nents, i) {
+		if (!dmabounce_sync_for_cpu(dev, sg_dma_address(s), 0,
+					    sg_dma_len(s), dir))
+			continue;
+
+		__dma_page_dev_to_cpu(sg_page(s), s->offset,
+				      s->length, dir);
+	}
+
+	debug_dma_sync_sg_for_cpu(dev, sg, nents, dir);
+}
+EXPORT_SYMBOL(dma_sync_sg_for_cpu);
+
+/**
+ * dma_sync_sg_for_device
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @sg: list of buffers
+ * @nents: number of buffers to map (returned from dma_map_sg)
+ * @dir: DMA transfer direction (same as was passed to dma_map_sg)
+ */
+void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+			    int nents, enum dma_data_direction dir)
+{
+	struct scatterlist *s;
+	int i;
+
+	for_each_sg(sg, s, nents, i) {
+		if (!dmabounce_sync_for_device(dev, sg_dma_address(s), 0,
+					       sg_dma_len(s), dir))
+			continue;
+
+		__dma_page_cpu_to_dev(sg_page(s), s->offset,
+				      s->length, dir);
+	}
+
+	debug_dma_sync_sg_for_device(dev, sg, nents, dir);
+}
+EXPORT_SYMBOL(dma_sync_sg_for_device);
+
+/*
+ * Return whether the given device DMA address mask can be supported
+ * properly.  For example, if your device can only drive the low 24-bits
+ * during bus mastering, then you would pass 0x00ffffff as the mask
+ * to this function.
+ */
+int dma_supported(struct device *dev, u64 mask)
+{
+	if (mask < (u64)arm_dma_limit)
+		return 0;
+	return 1;
+}
+EXPORT_SYMBOL(dma_supported);
+
+int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
+		return -EIO;
+
+#ifndef CONFIG_DMABOUNCE
+	*dev->dma_mask = dma_mask;
+#endif
+
+	return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+#define PREALLOC_DMA_DEBUG_ENTRIES	4096
+
+static int __init dma_debug_do_init(void)
+{
+	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
+	return 0;
+}
+fs_initcall(dma_debug_do_init);
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index f8037ba338ac..6f81c8e05c3a 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -364,6 +364,9 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
 #endif
 
 	arm_mm_memblock_reserve();
+#if defined(CONFIG_NON_ALIASED_COHERENT_MEM)
+	dma_coherent_reserve();
+#endif
 	arm_dt_memblock_reserve();
 
 	/* reserve any platform specific memblock areas */
diff --git a/arch/arm/mm/mm.h b/arch/arm/mm/mm.h
index 010566799c80..967a8048f321 100644
--- a/arch/arm/mm/mm.h
+++ b/arch/arm/mm/mm.h
@@ -31,3 +31,8 @@ extern u32 arm_dma_limit;
 
 void __init bootmem_init(void);
 void arm_mm_memblock_reserve(void);
+
+#ifdef CONFIG_NON_ALIASED_COHERENT_MEM
+void dma_coherent_reserve(void);
+void dma_coherent_mapping(void);
+#endif
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 4fa9c246ae93..bb80555edac9 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -273,6 +273,18 @@ static struct mem_type mem_types[] = {
 		.prot_l1   = PMD_TYPE_TABLE,
 		.domain    = DOMAIN_KERNEL,
 	},
+#ifdef CONFIG_NON_ALIASED_COHERENT_MEM
+	[MT_DMA_COHERENT] = {
+		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE |
+				PMD_SECT_S,
+		.domain    = DOMAIN_IO,
+	},
+	[MT_WC_COHERENT] = {
+		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE |
+				PMD_SECT_S,
+		.domain    = DOMAIN_IO,
+	},
+#endif
 };
 
 const struct mem_type *get_mem_type(unsigned int type)
@@ -353,6 +365,9 @@ static void __init build_mem_type_table(void)
 			mem_types[MT_DEVICE_NONSHARED].prot_sect |= PMD_SECT_XN;
 			mem_types[MT_DEVICE_CACHED].prot_sect |= PMD_SECT_XN;
 			mem_types[MT_DEVICE_WC].prot_sect |= PMD_SECT_XN;
+#ifdef CONFIG_NON_ALIASED_COHERENT_MEM
+			mem_types[MT_DMA_COHERENT].prot_sect |= PMD_SECT_XN;
+#endif
 		}
 		if (cpu_arch >= CPU_ARCH_ARMv7 && (cr & CR_TRE)) {
 			/*
@@ -457,13 +472,30 @@ static void __init build_mem_type_table(void)
 			/* Non-cacheable Normal is XCB = 001 */
 			mem_types[MT_MEMORY_NONCACHED].prot_sect |=
 				PMD_SECT_BUFFERED;
+#ifdef CONFIG_NON_ALIASED_COHERENT_MEM
+			mem_types[MT_WC_COHERENT].prot_sect |=
+				PMD_SECT_BUFFERED;
+			mem_types[MT_DMA_COHERENT].prot_sect |=
+				PMD_SECT_BUFFERED;
+#endif
 		} else {
 			/* For both ARMv6 and non-TEX-remapping ARMv7 */
 			mem_types[MT_MEMORY_NONCACHED].prot_sect |=
 				PMD_SECT_TEX(1);
+#ifdef CONFIG_NON_ALIASED_COHERENT_MEM
+			mem_types[MT_WC_COHERENT].prot_sect |=
+				PMD_SECT_TEX(1);
+#ifdef CONFIG_ARM_DMA_MEM_BUFFERABLE
+			mem_types[MT_DMA_COHERENT].prot_sect |=
+				PMD_SECT_TEX(1);
+#endif
+#endif
 		}
 	} else {
 		mem_types[MT_MEMORY_NONCACHED].prot_sect |= PMD_SECT_BUFFERABLE;
+#ifdef CONFIG_NON_ALIASED_COHERENT_MEM
+		mem_types[MT_WC_COHERENT].prot_sect |= PMD_SECT_BUFFERED;
+#endif
 	}
 
 	for (i = 0; i < 16; i++) {
@@ -986,6 +1018,10 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
 		create_mapping(&map);
 	}
 
+#ifdef CONFIG_NON_ALIASED_COHERENT_MEM
+	dma_coherent_mapping();
+#endif
+
 	/*
 	 * Ask the machine support to map in the statically mapped devices.
 	 */
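
For illustration only, a minimal driver-side sketch of how the allocator introduced above would be used. The example_probe/example_remove functions and the SZ_64K buffer size are hypothetical; the dma_* calls are the ones exported by dma-na-mapping.c. Note that with CONFIG_NON_ALIASED_COHERENT_MEM enabled, dma_free_writecombine() is a real function rather than the old #define alias to dma_free_coherent(), and allocations come from the pre-mapped non-aliased windows, so no dma_sync_*() cache maintenance is needed on them:

    #include <linux/dma-mapping.h>
    #include <linux/gfp.h>
    #include <asm/sizes.h>

    static void *buf;           /* kernel virtual address */
    static dma_addr_t buf_dma;  /* bus address handed to the device */

    /* Hypothetical probe: allocate a 64KiB writecombined buffer. */
    static int example_probe(struct device *dev)
    {
            buf = dma_alloc_writecombine(dev, SZ_64K, &buf_dma, GFP_KERNEL);
            if (!buf)
                    return -ENOMEM;
            /* Program buf_dma into the device; CPU writes go through buf. */
            return 0;
    }

    /* Hypothetical remove: must not be called with IRQs disabled. */
    static void example_remove(struct device *dev)
    {
            dma_free_writecombine(dev, SZ_64K, buf, buf_dma);
    }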