author		Gary King <GKing@nvidia.com>	2010-01-29 17:46:27 -0800
committer	Gary King <GKing@nvidia.com>	2010-02-02 10:52:45 -0800
commit		40c86b9c228b7b04300c52dd8955e9dc585ba881 (patch)
tree		88ceedf3d19c7abc1afb95863a3f193314fbfbf5
parent		8b8ffa33276fc5b2069a14724ce82cdeaf7e848c (diff)
[ARM] expose full data cache clean and flush DMA maintenance
Drivers which perform DMA mapping of large regions can optimize the necessary cache maintenance by using a full-cache clean or flush rather than looping over the region line-by-line. Previously, the only full-cache operation available was flush_kern_cache_all, which both invalidates the data cache (not always necessary) and invalidates the instruction cache, unnecessarily hurting performance on CPUs with Harvard caches.

Change-Id: If71015525457e9e7e481fc2afcdc76bc3fa8f8f4
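For illustration, this is roughly how a driver could choose between the
range-based and full-cache operations once this patch is applied. The
helper and the threshold value below are hypothetical, not part of this
patch; a real driver would tune the cutoff against the D-cache size of
the target CPU:

	#include <linux/types.h>
	#include <asm/cacheflush.h>

	/* Hypothetical cutoff: above this, one set/way walk over the
	 * whole cache is assumed cheaper than line-by-line maintenance. */
	#define FULL_FLUSH_THRESHOLD	(512 * 1024)

	static void example_dma_sync(const void *start, size_t len)
	{
		if (len >= FULL_FLUSH_THRESHOLD)
			dmac_flush_all();	/* clean+invalidate entire D cache */
		else
			dmac_flush_range(start, start + len);
	}

Note the tradeoff: the full-cache operation also evicts unrelated data,
so it only wins for buffers comparable in size to the cache itself.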
-rw-r--r--	arch/arm/include/asm/cacheflush.h	25
-rw-r--r--	arch/arm/kernel/smp.c	6
-rw-r--r--	arch/arm/mm/cache-v4.S	2
-rw-r--r--	arch/arm/mm/cache-v4wb.S	2
-rw-r--r--	arch/arm/mm/cache-v4wt.S	2
-rw-r--r--	arch/arm/mm/cache-v6.S	32
-rw-r--r--	arch/arm/mm/cache-v7.S	65
7 files changed, 122 insertions, 12 deletions
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 262120acc888..c4fbcc59203b 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -208,6 +208,8 @@ struct cpu_cache_fns {
void (*dma_inv_range)(const void *, const void *);
void (*dma_clean_range)(const void *, const void *);
void (*dma_flush_range)(const void *, const void *);
+ void (*dma_clean_all)(void);
+ void (*dma_flush_all)(void);
};
struct outer_cache_fns {
@@ -239,6 +241,8 @@ extern struct cpu_cache_fns cpu_cache;
#define dmac_inv_range cpu_cache.dma_inv_range
#define dmac_clean_range cpu_cache.dma_clean_range
#define dmac_flush_range cpu_cache.dma_flush_range
+#define dmac_clean_all cpu_cache.dma_clean_all
+#define dmac_flush_all cpu_cache.dma_flush_all
#else
@@ -255,6 +259,8 @@ static inline void v7m_flush_kern_dcache_page(void *a) { }
static inline void v7m_dma_inv_range(const void *a, const void *b) { }
static inline void v7m_dma_clean_range(const void *a, const void *b) { }
static inline void v7m_dma_flush_range(const void *a, const void *b) { }
+static inline void v7m_dma_clean_all(void) { }
+static inline void v7m_dma_flush_all(void) { }
#endif
@@ -281,10 +287,14 @@ extern void __cpuc_flush_dcache_page(void *);
#define dmac_inv_range __glue(_CACHE,_dma_inv_range)
#define dmac_clean_range __glue(_CACHE,_dma_clean_range)
#define dmac_flush_range __glue(_CACHE,_dma_flush_range)
+#define dmac_clean_all __glue(_CACHE,_dma_clean_all)
+#define dmac_flush_all __glue(_CACHE,_dma_flush_all)
extern void dmac_inv_range(const void *, const void *);
extern void dmac_clean_range(const void *, const void *);
extern void dmac_flush_range(const void *, const void *);
+extern void dmac_clean_all(void);
+extern void dmac_flush_all(void);
#endif
@@ -293,6 +303,8 @@ enum smp_dma_cache_type {
SMP_DMA_CACHE_INV,
SMP_DMA_CACHE_CLEAN,
SMP_DMA_CACHE_FLUSH,
+ SMP_DMA_CACHE_CLEAN_ALL,
+ SMP_DMA_CACHE_FLUSH_ALL,
};
extern void smp_dma_cache_op(int type, const void *start, const void *end);
@@ -311,10 +323,23 @@ static inline void smp_dma_flush_range(const void *start, const void *end)
{
smp_dma_cache_op(SMP_DMA_CACHE_FLUSH, start, end);
}
+
+static inline void smp_dma_clean_all(void)
+{
+ smp_dma_cache_op(SMP_DMA_CACHE_CLEAN_ALL, NULL, NULL);
+}
+
+static inline void smp_dma_flush_all(void)
+{
+ smp_dma_cache_op(SMP_DMA_CACHE_FLUSH_ALL, NULL, NULL);
+}
+
#else
#define smp_dma_inv_range dmac_inv_range
#define smp_dma_clean_range dmac_clean_range
#define smp_dma_flush_range dmac_flush_range
+#define smp_dma_clean_all dmac_clean_all
+#define smp_dma_flush_all dmac_flush_all
#endif
#ifdef CONFIG_OUTER_CACHE
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index f130fda2fbbe..40eff61d6d8e 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -751,6 +751,12 @@ static void local_dma_cache_op(int type, const void *start, const void *end)
case SMP_DMA_CACHE_FLUSH:
dmac_flush_range(start, end);
break;
+ case SMP_DMA_CACHE_CLEAN_ALL:
+ dmac_clean_all();
+ break;
+ case SMP_DMA_CACHE_FLUSH_ALL:
+ dmac_flush_all();
+ break;
default:
printk(KERN_CRIT "CPU%u: Unknown SMP DMA cache type %d\n",
smp_processor_id(), type);
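Set/way maintenance only affects the cache of the CPU that issues it,
so on SMP the new *_ALL types must run on every core, like the range
operations already handled here. A minimal sketch of the broadcast
path, assuming smp_dma_cache_op() is built on on_each_cpu(); the
argument struct and wrapper are illustrative, and the real routine in
arch/arm/kernel/smp.c is not shown in this hunk:

	struct dma_cache_args {
		int type;
		const void *start;
		const void *end;
	};

	static void do_local_dma_cache_op(void *info)
	{
		struct dma_cache_args *a = info;

		local_dma_cache_op(a->type, a->start, a->end);
	}

	void smp_dma_cache_op(int type, const void *start, const void *end)
	{
		struct dma_cache_args args = { type, start, end };

		on_each_cpu(do_local_dma_cache_op, &args, 1); /* wait for all CPUs */
	}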
diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S
index 3668611cb400..f8ff33d66ede 100644
--- a/arch/arm/mm/cache-v4.S
+++ b/arch/arm/mm/cache-v4.S
@@ -145,4 +145,6 @@ ENTRY(v4_cache_fns)
.long v4_dma_inv_range
.long v4_dma_clean_range
.long v4_dma_flush_range
+ .long v4_flush_kern_cache_all
+ .long v4_flush_kern_cache_all
.size v4_cache_fns, . - v4_cache_fns
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index 2ebc1b3bf856..771b978c59df 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -228,4 +228,6 @@ ENTRY(v4wb_cache_fns)
.long v4wb_dma_inv_range
.long v4wb_dma_clean_range
.long v4wb_dma_flush_range
+ .long v4wb_flush_kern_cache_all
+ .long v4wb_flush_kern_cache_all
.size v4wb_cache_fns, . - v4wb_cache_fns
diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S
index c54fa2cc40e6..1dfe16103e42 100644
--- a/arch/arm/mm/cache-v4wt.S
+++ b/arch/arm/mm/cache-v4wt.S
@@ -184,4 +184,6 @@ ENTRY(v4wt_cache_fns)
.long v4wt_dma_inv_range
.long v4wt_dma_clean_range
.long v4wt_dma_flush_range
+ .long v4wt_flush_kern_cache_all
+ .long v4wt_flush_kern_cache_all
.size v4wt_cache_fns, . - v4wt_cache_fns
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 55f7ecd1264e..12d17c702e16 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -274,6 +274,36 @@ v6_dma_flush_dcache_all:
mov pc, lr
#endif
+/*
+ * v6_dma_clean_all()
+ *
+ * cleans the entire L1 data cache
+ */
+ENTRY(v6_dma_clean_all)
+#ifdef HARVARD_CACHE
+ mcr p15, 0, r0, c7, c10, 0 @ D cache clean
+#else
+ mcr p15, 0, r0, c7, c15, 0 @ Cache clean+invalidate
+#endif
+ mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
+ mov pc, lr
+
+/*
+ * v6_dma_flush_all()
+ *
+ * flushes the entire L1 data cache
+ */
+ENTRY(v6_dma_flush_all)
+#ifdef HARVARD_CACHE
+ mcr p15, 0, r0, c7, c14, 0 @ D cache clean+invalidate
+#else
+ mcr p15, 0, r0, c7, c15, 0 @ Cache clean+invalidate
+#endif
+ mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
+ mov pc, lr
+
+
+
__INITDATA
.type v6_cache_fns, #object
@@ -287,4 +317,6 @@ ENTRY(v6_cache_fns)
.long v6_dma_inv_range
.long v6_dma_clean_range
.long v6_dma_flush_range
+ .long v6_dma_clean_all
+ .long v6_dma_flush_all
.size v6_cache_fns, . - v6_cache_fns
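Unlike ARMv7, ARMv6 still provides single CP15 operations that cover
the whole cache, which is why the routines above are only a few
instructions. For reference, the Harvard-cache clean path added above
is equivalent to the following C-level inline assembly (the register
operand is conventionally zero and is ignored by these operations):

	static inline void v6_dma_clean_all_equiv(void)
	{
		/* c7, c10, 0: clean entire D cache */
		asm volatile("mcr p15, 0, %0, c7, c10, 0" : : "r" (0) : "memory");
		/* c7, c10, 4: drain write buffer */
		asm volatile("mcr p15, 0, %0, c7, c10, 4" : : "r" (0) : "memory");
	}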
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index 5327bd1b9bcf..8c2975898da9 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -26,19 +26,19 @@
*
* - mm - mm_struct describing address space
*/
-ENTRY(v7_flush_dcache_all)
+ .macro v7_way_op, op
dmb @ ensure ordering with previous memory accesses
mrc p15, 1, r0, c0, c0, 1 @ read clidr
ands r3, r0, #0x7000000 @ extract loc from clidr
mov r3, r3, lsr #23 @ left align loc bit field
- beq finished @ if loc is 0, then no need to clean
+ beq 50f @ if loc is 0, then no need to clean
mov r10, #0 @ start clean at cache level 0
-loop1:
+10:
add r2, r10, r10, lsr #1 @ work out 3x current cache level
mov r1, r0, lsr r2 @ extract cache type bits from clidr
and r1, r1, #7 @ mask of the bits for current cache only
cmp r1, #2 @ see what cache we have at this level
- blt skip @ skip if no cache, or just i-cache
+ blt 40f @ skip if no cache, or just i-cache
mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr
isb @ isb to sync the new cssr&csidr
mrc p15, 1, r1, c0, c0, 0 @ read the new csidr
@@ -49,32 +49,42 @@ loop1:
clz r5, r4 @ find bit position of way size increment
ldr r7, =0x7fff
ands r7, r7, r1, lsr #13 @ extract max number of the index size
-loop2:
+20:
mov r9, r4 @ create working copy of max way size
-loop3:
+30:
ARM( orr r11, r10, r9, lsl r5 ) @ factor way and cache number into r11
THUMB( lsl r6, r9, r5 )
THUMB( orr r11, r10, r6 ) @ factor way and cache number into r11
ARM( orr r11, r11, r7, lsl r2 ) @ factor index number into r11
THUMB( lsl r6, r7, r2 )
THUMB( orr r11, r11, r6 ) @ factor index number into r11
- mcr p15, 0, r11, c7, c14, 2 @ clean & invalidate by set/way
+ mcr p15, 0, r11, c7, \op, 2 @ clean and/or invalidate by set/way
subs r9, r9, #1 @ decrement the way
- bge loop3
+ bge 30b
subs r7, r7, #1 @ decrement the index
- bge loop2
-skip:
+ bge 20b
+40:
add r10, r10, #2 @ increment cache number
cmp r3, r10
- bgt loop1
-finished:
+ bgt 10b
+50:
mov r10, #0 @ switch back to cache level 0
mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr
dsb
isb
+ .endm
+
+ENTRY(v7_flush_dcache_all)
+ v7_way_op c14
mov pc, lr
ENDPROC(v7_flush_dcache_all)
+
+ENTRY(v7_clean_dcache_all)
+ v7_way_op c10
+ mov pc, lr
+ENDPROC(v7_clean_dcache_all)
+
/*
* v7_flush_cache_all()
*
@@ -266,6 +276,35 @@ ENTRY(v7_dma_flush_range)
mov pc, lr
ENDPROC(v7_dma_flush_range)
+/*
+ * v7_dma_flush_all()
+ *
+ * flushes the entire L1 data cache
+ */
+ENTRY(v7_dma_flush_all)
+ ARM( stmfd sp!, {r4-r5, r7, r9-r11, lr} )
+ THUMB( stmfd sp!, {r4-r7, r9-r11, lr} )
+ bl v7_flush_dcache_all
+ ARM( ldmfd sp!, {r4-r5, r7, r9-r11, lr} )
+ THUMB( ldmfd sp!, {r4-r7, r9-r11, lr} )
+ mov pc, lr
+ENDPROC(v7_dma_flush_all)
+
+/*
+ * v7_dma_clean_all()
+ *
+ * cleans the entire L1 data cache
+ */
+ENTRY(v7_dma_clean_all)
+ ARM( stmfd sp!, {r4-r5, r7, r9-r11, lr} )
+ THUMB( stmfd sp!, {r4-r7, r9-r11, lr} )
+ bl v7_clean_dcache_all
+ ARM( ldmfd sp!, {r4-r5, r7, r9-r11, lr} )
+ THUMB( ldmfd sp!, {r4-r7, r9-r11, lr} )
+ mov pc, lr
+ENDPROC(v7_dma_clean_all)
+
+
__INITDATA
.type v7_cache_fns, #object
@@ -279,4 +318,6 @@ ENTRY(v7_cache_fns)
.long v7_dma_inv_range
.long v7_dma_clean_range
.long v7_dma_flush_range
+ .long v7_dma_clean_all
+ .long v7_dma_flush_all
.size v7_cache_fns, . - v7_cache_fns
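ARMv7 dropped the whole-cache CP15 operations, so v7_way_op has to
enumerate the cache geometry from CLIDR/CCSIDR and issue one set/way
operation per cache line. A C rendering of that walk, for readability
only; read_clidr(), write_csselr() and read_ccsidr() are hypothetical
CP15 accessors, and the real code stays in assembly so it can run
without touching memory:

	/* 'op' stands in for the \op macro argument: DCCSW (c10, clean)
	 * or DCCISW (c14, clean+invalidate) on the composed value. */
	static void v7_way_walk(void (*op)(u32 setway))
	{
		u32 clidr = read_clidr();
		u32 loc = (clidr >> 24) & 0x7;	/* level of coherency */
		u32 level, way, set;

		for (level = 0; level < loc; level++) {
			u32 ctype = (clidr >> (level * 3)) & 0x7;
			u32 ccsidr, sets, ways, line_shift, way_shift;

			if (ctype < 2)		/* no cache, or I-cache only */
				continue;

			write_csselr(level << 1); /* D/unified cache at this level */
			ccsidr = read_ccsidr();
			line_shift = (ccsidr & 0x7) + 4;	/* log2(line bytes) */
			ways = ((ccsidr >> 3) & 0x3ff) + 1;
			sets = ((ccsidr >> 13) & 0x7fff) + 1;
			/* bit position of the way field; the clz in the macro */
			way_shift = ways > 1 ? 32 - fls(ways - 1) : 0;

			for (way = 0; way < ways; way++)
				for (set = 0; set < sets; set++)
					op((way << way_shift) |
					   (set << line_shift) | (level << 1));
		}
	}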