summary | refs | log | tree | commit | diff
path: root/arch/tile/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/tile/lib')
-rw-r--r-- arch/tile/lib/Makefile 5
-rw-r--r-- arch/tile/lib/atomic_32.c 17
-rw-r--r-- arch/tile/lib/atomic_asm_32.S 2
-rw-r--r-- arch/tile/lib/cacheflush.c 102
-rw-r--r-- arch/tile/lib/delay.c 21
-rw-r--r-- arch/tile/lib/exports.c 10
-rw-r--r-- arch/tile/lib/mb_incoherent.S 34
-rw-r--r-- arch/tile/lib/memcpy_tile64.c 4
-rw-r--r-- arch/tile/lib/spinlock_32.c 161
9 files changed, 234 insertions, 122 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 93122d5b1558..0c26086ecbef 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -2,9 +2,8 @@
# Makefile for TILE-specific library files..
#
-lib-y = cacheflush.o checksum.o cpumask.o delay.o \
- mb_incoherent.o uaccess.o memmove.o \
- memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
+lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
+ memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
strchr_$(BITS).o strlen_$(BITS).o
ifeq ($(CONFIG_TILEGX),y)
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index 7a5cc706ab62..46570211df52 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -46,14 +46,13 @@ struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
/* This page is remapped on startup to be hash-for-home. */
-int atomic_locks[PAGE_SIZE / sizeof(int) /* Only ATOMIC_HASH_SIZE is used */]
- __attribute__((aligned(PAGE_SIZE), section(".bss.page_aligned")));
+int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;
#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
static inline int *__atomic_hashed_lock(volatile void *v)
{
- /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec.S */
+ /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
unsigned long i =
(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
@@ -203,32 +202,32 @@ static inline int *__futex_setup(int __user *v)
return __atomic_hashed_lock((int __force *)v);
}
-struct __get_user futex_set(int __user *v, int i)
+struct __get_user futex_set(u32 __user *v, int i)
{
return __atomic_xchg((int __force *)v, __futex_setup(v), i);
}
-struct __get_user futex_add(int __user *v, int n)
+struct __get_user futex_add(u32 __user *v, int n)
{
return __atomic_xchg_add((int __force *)v, __futex_setup(v), n);
}
-struct __get_user futex_or(int __user *v, int n)
+struct __get_user futex_or(u32 __user *v, int n)
{
return __atomic_or((int __force *)v, __futex_setup(v), n);
}
-struct __get_user futex_andn(int __user *v, int n)
+struct __get_user futex_andn(u32 __user *v, int n)
{
return __atomic_andn((int __force *)v, __futex_setup(v), n);
}
-struct __get_user futex_xor(int __user *v, int n)
+struct __get_user futex_xor(u32 __user *v, int n)
{
return __atomic_xor((int __force *)v, __futex_setup(v), n);
}
-struct __get_user futex_cmpxchg(int __user *v, int o, int n)
+struct __get_user futex_cmpxchg(u32 __user *v, int o, int n)
{
return __atomic_cmpxchg((int __force *)v, __futex_setup(v), o, n);
}
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 5a5514b77e78..82f64cc63658 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -14,7 +14,7 @@
* Support routines for atomic operations. Each function takes:
*
* r0: address to manipulate
- * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG)
+ * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)
* r2: new value to write, or for cmpxchg/add_unless, value to compare against
* r3: (cmpxchg/xchg_add_unless) new value to write or add;
* (atomic64 ops) high word of value to write
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
index 11b6164c2097..35c1d8ca5f38 100644
--- a/arch/tile/lib/cacheflush.c
+++ b/arch/tile/lib/cacheflush.c
@@ -21,3 +21,105 @@ void __flush_icache_range(unsigned long start, unsigned long end)
{
invalidate_icache((const void *)start, end - start, PAGE_SIZE);
}
+
+
+/* Force a load instruction to issue. */
+static inline void force_load(char *p)
+{
+ *(volatile char *)p;
+}
+
+/*
+ * Flush and invalidate a VA range that is homed remotely on a single
+ * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting
+ * until the memory controller holds the flushed values.
+ */
+void finv_buffer_remote(void *buffer, size_t size, int hfh)
+{
+ char *p, *base;
+ size_t step_size, load_count;
+ const unsigned long STRIPE_WIDTH = 8192;
+
+ /*
+ * Flush and invalidate the buffer out of the local L1/L2
+ * and request the home cache to flush and invalidate as well.
+ */
+ __finv_buffer(buffer, size);
+
+ /*
+ * Wait for the home cache to acknowledge that it has processed
+ * all the flush-and-invalidate requests. This does not mean
+ * that the flushed data has reached the memory controller yet,
+ * but it does mean the home cache is processing the flushes.
+ */
+ __insn_mf();
+
+ /*
+ * Issue a load to the last cache line, which can't complete
+ * until all the previously-issued flushes to the same memory
+ * controller have also completed. If we weren't striping
+ * memory, that one load would be sufficient, but since we may
+ * be, we also need to back up to the last load issued to
+ * another memory controller, which would be the point where
+ * we crossed an 8KB boundary (the granularity of striping
+ * across memory controllers). Keep backing up and doing this
+ * until we are before the beginning of the buffer, or have
+ * hit all the controllers.
+ *
+ * If we are flushing a hash-for-home buffer, it's even worse.
+ * Each line may be homed on a different tile, and each tile
+ * may have up to four lines that are on different
+ * controllers. So as we walk backwards, we have to touch
+ * enough cache lines to satisfy these constraints. In
+ * practice this ends up being close enough to "load from
+ * every cache line on a full memory stripe on each
+ * controller" that we simply do that, to simplify the logic.
+ *
+ * FIXME: See bug 9535 for some issues with this code.
+ */
+ if (hfh) {
+ step_size = L2_CACHE_BYTES;
+ load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) *
+ (1 << CHIP_LOG_NUM_MSHIMS());
+ } else {
+ step_size = STRIPE_WIDTH;
+ load_count = (1 << CHIP_LOG_NUM_MSHIMS());
+ }
+
+ /* Load the last byte of the buffer. */
+ p = (char *)buffer + size - 1;
+ force_load(p);
+
+ /* Bump down to the end of the previous stripe or cache line. */
+ p -= step_size;
+ p = (char *)((unsigned long)p | (step_size - 1));
+
+ /* Figure out how far back we need to go. */
+ base = p - (step_size * (load_count - 2));
+ if ((long)base < (long)buffer)
+ base = buffer;
+
+ /*
+ * Fire all the loads we need. The MAF only has eight entries
+ * so we can have at most eight outstanding loads, so we
+ * unroll by that amount.
+ */
+#pragma unroll 8
+ for (; p >= base; p -= step_size)
+ force_load(p);
+
+ /*
+ * Repeat, but with inv's instead of loads, to get rid of the
+ * data we just loaded into our own cache and the old home L3.
+ * No need to unroll since inv's don't target a register.
+ */
+ p = (char *)buffer + size - 1;
+ __insn_inv(p);
+ p -= step_size;
+ p = (char *)((unsigned long)p | (step_size - 1));
+ for (; p >= base; p -= step_size)
+ __insn_inv(p);
+
+ /* Wait for the load+inv's (and thus finvs) to have completed. */
+ __insn_mf();
+}
diff --git a/arch/tile/lib/delay.c b/arch/tile/lib/delay.c
index 5801b03c13ef..cdacdd11d360 100644
--- a/arch/tile/lib/delay.c
+++ b/arch/tile/lib/delay.c
@@ -15,20 +15,31 @@
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/thread_info.h>
-#include <asm/fixmap.h>
-#include <hv/hypervisor.h>
+#include <asm/timex.h>
void __udelay(unsigned long usecs)
{
- hv_nanosleep(usecs * 1000);
+ if (usecs > ULONG_MAX / 1000) {
+ WARN_ON_ONCE(usecs > ULONG_MAX / 1000);
+ usecs = ULONG_MAX / 1000;
+ }
+ __ndelay(usecs * 1000);
}
EXPORT_SYMBOL(__udelay);
void __ndelay(unsigned long nsecs)
{
- hv_nanosleep(nsecs);
+ cycles_t target = get_cycles();
+ target += ns2cycles(nsecs);
+ while (get_cycles() < target)
+ cpu_relax();
}
EXPORT_SYMBOL(__ndelay);
-/* FIXME: should be declared in a header somewhere. */
+void __delay(unsigned long cycles)
+{
+ cycles_t target = get_cycles() + cycles;
+ while (get_cycles() < target)
+ cpu_relax();
+}
EXPORT_SYMBOL(__delay);
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index 1509c5597653..49284fae9d09 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -29,6 +29,9 @@ EXPORT_SYMBOL(__put_user_8);
EXPORT_SYMBOL(strnlen_user_asm);
EXPORT_SYMBOL(strncpy_from_user_asm);
EXPORT_SYMBOL(clear_user_asm);
+EXPORT_SYMBOL(flush_user_asm);
+EXPORT_SYMBOL(inv_user_asm);
+EXPORT_SYMBOL(finv_user_asm);
/* arch/tile/kernel/entry.S */
#include <linux/kernel.h>
@@ -45,9 +48,6 @@ EXPORT_SYMBOL(__copy_from_user_zeroing);
EXPORT_SYMBOL(__copy_in_user_inatomic);
#endif
-/* arch/tile/lib/mb_incoherent.S */
-EXPORT_SYMBOL(__mb_incoherent);
-
/* hypervisor glue */
#include <hv/hypervisor.h>
EXPORT_SYMBOL(hv_dev_open);
@@ -85,4 +85,8 @@ int64_t __muldi3(int64_t, int64_t);
EXPORT_SYMBOL(__muldi3);
uint64_t __lshrdi3(uint64_t, unsigned int);
EXPORT_SYMBOL(__lshrdi3);
+uint64_t __ashrdi3(uint64_t, unsigned int);
+EXPORT_SYMBOL(__ashrdi3);
+uint64_t __ashldi3(uint64_t, unsigned int);
+EXPORT_SYMBOL(__ashldi3);
#endif
diff --git a/arch/tile/lib/mb_incoherent.S b/arch/tile/lib/mb_incoherent.S
deleted file mode 100644
index 989ad7b68d5a..000000000000
--- a/arch/tile/lib/mb_incoherent.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for
- * more details.
- *
- * Assembly code for invoking the HV's fence_incoherent syscall.
- */
-
-#include <linux/linkage.h>
-#include <hv/syscall_public.h>
-#include <arch/abi.h>
-#include <arch/chip.h>
-
-#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
-
-/*
- * Invoke the hypervisor's fence_incoherent syscall, which guarantees
- * that all victims for cachelines homed on this tile have reached memory.
- */
-STD_ENTRY(__mb_incoherent)
- moveli TREG_SYSCALL_NR_NAME, HV_SYS_fence_incoherent
- swint2
- jrp lr
- STD_ENDPROC(__mb_incoherent)
-
-#endif
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
index f7d4a6ad61e8..b2fe15e01075 100644
--- a/arch/tile/lib/memcpy_tile64.c
+++ b/arch/tile/lib/memcpy_tile64.c
@@ -96,7 +96,7 @@ static void memcpy_multicache(void *dest, const void *source,
newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
ptep = pte_offset_kernel(pmdp, newsrc);
- *ptep = src_pte; /* set_pte() would be confused by this */
+ __set_pte(ptep, src_pte); /* set_pte() would be confused by this */
local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
/* Actually move the data. */
@@ -109,7 +109,7 @@ static void memcpy_multicache(void *dest, const void *source,
*/
src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
- *ptep = src_pte; /* set_pte() would be confused by this */
+ __set_pte(ptep, src_pte); /* set_pte() would be confused by this */
local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
/*
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c
index 5cd1c4004eca..cb0999fb64b4 100644
--- a/arch/tile/lib/spinlock_32.c
+++ b/arch/tile/lib/spinlock_32.c
@@ -15,6 +15,7 @@
#include <linux/spinlock.h>
#include <linux/module.h>
#include <asm/processor.h>
+#include <arch/spr_def.h>
#include "spinlock_common.h"
@@ -91,75 +92,75 @@ EXPORT_SYMBOL(arch_spin_unlock_wait);
#define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1)
-/* Lock the word, spinning until there are no tns-ers. */
-static inline u32 get_rwlock(arch_rwlock_t *rwlock)
-{
- u32 iterations = 0;
- for (;;) {
- u32 val = __insn_tns((int *)&rwlock->lock);
- if (unlikely(val & 1)) {
- delay_backoff(iterations++);
- continue;
- }
- return val;
- }
-}
-
-int arch_read_trylock_slow(arch_rwlock_t *rwlock)
-{
- u32 val = get_rwlock(rwlock);
- int locked = (val << RD_COUNT_WIDTH) == 0;
- rwlock->lock = val + (locked << RD_COUNT_SHIFT);
- return locked;
-}
-EXPORT_SYMBOL(arch_read_trylock_slow);
-
-void arch_read_unlock_slow(arch_rwlock_t *rwlock)
-{
- u32 val = get_rwlock(rwlock);
- rwlock->lock = val - (1 << RD_COUNT_SHIFT);
-}
-EXPORT_SYMBOL(arch_read_unlock_slow);
-
-void arch_write_unlock_slow(arch_rwlock_t *rwlock, u32 val)
+/*
+ * We can get the read lock if everything but the reader bits (which
+ * are in the high part of the word) is zero, i.e. no active or
+ * waiting writers, no tns.
+ *
+ * We guard the tns/store-back with an interrupt critical section to
+ * preserve the semantic that the same read lock can be acquired in an
+ * interrupt context.
+ */
+inline int arch_read_trylock(arch_rwlock_t *rwlock)
{
- u32 eq, mask = 1 << WR_CURR_SHIFT;
- while (unlikely(val & 1)) {
- /* Limited backoff since we are the highest-priority task. */
- relax(4);
- val = __insn_tns((int *)&rwlock->lock);
+ u32 val;
+ __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
+ val = __insn_tns((int *)&rwlock->lock);
+ if (likely((val << _RD_COUNT_WIDTH) == 0)) {
+ val += 1 << RD_COUNT_SHIFT;
+ rwlock->lock = val;
+ __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+ BUG_ON(val == 0); /* we don't expect wraparound */
+ return 1;
}
- val = __insn_addb(val, mask);
- eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT));
- val = __insn_mz(eq & mask, val);
- rwlock->lock = val;
+ if ((val & 1) == 0)
+ rwlock->lock = val;
+ __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+ return 0;
}
-EXPORT_SYMBOL(arch_write_unlock_slow);
+EXPORT_SYMBOL(arch_read_trylock);
/*
- * We spin until everything but the reader bits (which are in the high
- * part of the word) are zero, i.e. no active or waiting writers, no tns.
- *
+ * Spin doing arch_read_trylock() until we acquire the lock.
* ISSUE: This approach can permanently starve readers. A reader who sees
* a writer could instead take a ticket lock (just like a writer would),
* and atomically enter read mode (with 1 reader) when it gets the ticket.
- * This way both readers and writers will always make forward progress
+ * This way both readers and writers would always make forward progress
* in a finite time.
*/
-void arch_read_lock_slow(arch_rwlock_t *rwlock, u32 val)
+void arch_read_lock(arch_rwlock_t *rwlock)
{
u32 iterations = 0;
- do {
- if (!(val & 1))
- rwlock->lock = val;
+ while (unlikely(!arch_read_trylock(rwlock)))
delay_backoff(iterations++);
+}
+EXPORT_SYMBOL(arch_read_lock);
+
+void arch_read_unlock(arch_rwlock_t *rwlock)
+{
+ u32 val, iterations = 0;
+
+ mb(); /* guarantee anything modified under the lock is visible */
+ for (;;) {
+ __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
val = __insn_tns((int *)&rwlock->lock);
- } while ((val << RD_COUNT_WIDTH) != 0);
- rwlock->lock = val + (1 << RD_COUNT_SHIFT);
+ if (likely((val & 1) == 0)) { /* low bit clear: no other tns in progress */
+ rwlock->lock = val - (1 << _RD_COUNT_SHIFT);
+ __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+ break;
+ }
+ __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+ delay_backoff(iterations++); /* back off before retrying the tns */
+ }
}
-EXPORT_SYMBOL(arch_read_lock_slow);
+EXPORT_SYMBOL(arch_read_unlock);
-void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
+/*
+ * We don't need an interrupt critical section here (unlike for
+ * arch_read_lock) since we should never use a bare write lock where
+ * it could be interrupted by code that could try to re-acquire it.
+ */
+void arch_write_lock(arch_rwlock_t *rwlock)
{
/*
* The trailing underscore on this variable (and curr_ below)
@@ -168,6 +169,12 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
*/
u32 my_ticket_;
u32 iterations = 0;
+ u32 val = __insn_tns((int *)&rwlock->lock);
+
+ if (likely(val == 0)) {
+ rwlock->lock = 1 << _WR_NEXT_SHIFT;
+ return;
+ }
/*
* Wait until there are no readers, then bump up the next
@@ -206,23 +213,47 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
relax(4);
}
}
-EXPORT_SYMBOL(arch_write_lock_slow);
+EXPORT_SYMBOL(arch_write_lock);
-int __tns_atomic_acquire(atomic_t *lock)
+int arch_write_trylock(arch_rwlock_t *rwlock)
{
- int ret;
- u32 iterations = 0;
+ u32 val = __insn_tns((int *)&rwlock->lock);
- BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION));
- __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
+ /*
+ * If a tns is in progress, or there's a waiting or active locker,
+ * or active readers, we can't take the lock, so give up.
+ */
+ if (unlikely(val != 0)) {
+ if (!(val & 1))
+ rwlock->lock = val;
+ return 0;
+ }
- while ((ret = __insn_tns((void *)&lock->counter)) == 1)
- delay_backoff(iterations++);
- return ret;
+ /* Set the "next" field to mark it locked. */
+ rwlock->lock = 1 << _WR_NEXT_SHIFT;
+ return 1;
}
+EXPORT_SYMBOL(arch_write_trylock);
-void __tns_atomic_release(atomic_t *p, int v)
+void arch_write_unlock(arch_rwlock_t *rwlock)
{
- p->counter = v;
- __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+ u32 val, eq, mask;
+
+ mb(); /* guarantee anything modified under the lock is visible */
+ val = __insn_tns((int *)&rwlock->lock);
+ if (likely(val == (1 << _WR_NEXT_SHIFT))) {
+ rwlock->lock = 0;
+ return;
+ }
+ while (unlikely(val & 1)) {
+ /* Limited backoff since we are the highest-priority task. */
+ relax(4);
+ val = __insn_tns((int *)&rwlock->lock);
+ }
+ mask = 1 << WR_CURR_SHIFT;
+ val = __insn_addb(val, mask);
+ eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT));
+ val = __insn_mz(eq & mask, val);
+ rwlock->lock = val;
}
+EXPORT_SYMBOL(arch_write_unlock);