From 517ffce4e1a03aea979fe3a18a3dd1761a24fafb Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Fri, 26 Oct 2012 15:18:37 -0700
Subject: sparc64: Make montmul/montsqr/mpmul usable in 32-bit threads.

The Montgomery Multiply, Montgomery Square, and Multiple-Precision
Multiply instructions work by loading a combination of the floating
point and multiple register windows worth of integer registers with
the inputs.  These values are 64-bit.

But for 32-bit userland processes we only save the low 32 bits of each
integer register during a register spill.  This is because the
register window save area is in the user stack and has a fixed layout.

Therefore, the only way to use these instructions in 32-bit mode is to
perform the following sequence:

1) Load the top 32 bits of a chosen integer register with a sentinel,
   say "-1".  This will be in the outer-most register window.

   The idea is that we're trying to see if the outer-most register
   window gets spilled, and thus the 64-bit values were truncated.

2) Load all the inputs for the montmul/montsqr/mpmul instruction,
   down to the inner-most register window.

3) Execute the opcode.

4) Traverse back up to the outer-most register window.

5) Check the sentinel; if it's still "-1", store the results.
   Otherwise retry the entire sequence.

This retry is extremely troublesome.  If you're just unlucky and an
interrupt or other trap happens, it'll push that outer-most window to
the stack and clear the sentinel when we restore it.

We could retry forever and never make forward progress if interrupts
arrive at a fast enough rate (consider perf events as one example).
So we have to do limited retries and fall back to software, which is
extremely non-deterministic.

Luckily it's very straightforward to provide a mechanism to let
32-bit applications use a 64-bit stack.  Stacks in 64-bit mode are
biased by 2047 bytes, which means that the lowest bit is set in the
actual %sp register value.

So if we see bit zero set in a 32-bit application's stack we treat it
like a 64-bit stack.

Runtime detection of such a facility is tricky, and cumbersome at
best.  For example, just trying to use a biased stack and seeing if it
works is hard to recover from (the signal handler will need to use an
alt stack, plus something along the lines of longjmp).  Therefore, we
add a system call to report a bitmask of arch-specific features like
this in a cheap and less hairy way.

With help from Andy Polyakov.

Signed-off-by: David S. Miller
---
 arch/sparc/include/asm/compat.h         |  5 +++--
 arch/sparc/include/asm/thread_info_64.h |  5 +++++
 arch/sparc/include/asm/ttable.h         | 24 ++++++++++++++++--------
 arch/sparc/include/uapi/asm/unistd.h    |  6 +++++-
 4 files changed, 29 insertions(+), 11 deletions(-)
(limited to 'arch/sparc/include')

diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h
index cef99fbc0a21..830502fe62b4 100644
--- a/arch/sparc/include/asm/compat.h
+++ b/arch/sparc/include/asm/compat.h
@@ -232,9 +232,10 @@ static inline void __user *arch_compat_alloc_user_space(long len)
 	struct pt_regs *regs = current_thread_info()->kregs;
 	unsigned long usp = regs->u_regs[UREG_I6];
 
-	if (!(test_thread_flag(TIF_32BIT)))
+	if (test_thread_64bit_stack(usp))
 		usp += STACK_BIAS;
-	else
+
+	if (test_thread_flag(TIF_32BIT))
 		usp &= 0xffffffffUL;
 
 	usp -= len;
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 4e2276631081..a3fe4dcc0aa6 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -259,6 +259,11 @@ static inline bool test_and_clear_restore_sigmask(void)
 
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 
+#define thread32_stack_is_64bit(__SP) (((__SP) & 0x1) != 0)
+#define test_thread_64bit_stack(__SP) \
+	((test_thread_flag(TIF_32BIT) && !thread32_stack_is_64bit(__SP)) ? \
+	 false : true)
+
 #endif	/* !__ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/sparc/include/asm/ttable.h b/arch/sparc/include/asm/ttable.h
index 48f2807d3265..71b5a67522ab 100644
--- a/arch/sparc/include/asm/ttable.h
+++ b/arch/sparc/include/asm/ttable.h
@@ -372,7 +372,9 @@ etrap_spill_fixup_64bit: \
 
 	/* Normal 32bit spill */
 #define SPILL_2_GENERIC(ASI)				\
-	srl	%sp, 0, %sp;				\
+	and	%sp, 1, %g3;				\
+	brnz,pn	%g3, (. - (128 + 4));			\
+	 srl	%sp, 0, %sp;				\
 	stwa	%l0, [%sp + %g0] ASI;			\
 	mov	0x04, %g3;				\
 	stwa	%l1, [%sp + %g3] ASI;			\
@@ -398,14 +400,16 @@ etrap_spill_fixup_64bit: \
 	stwa	%i6, [%g1 + %g0] ASI;			\
 	stwa	%i7, [%g1 + %g3] ASI;			\
 	saved;						\
-	retry; nop; nop;				\
+	retry;						\
 	b,a,pt	%xcc, spill_fixup_dax;			\
 	b,a,pt	%xcc, spill_fixup_mna;			\
 	b,a,pt	%xcc, spill_fixup;
 
 #define SPILL_2_GENERIC_ETRAP				\
 etrap_user_spill_32bit:					\
-	srl	%sp, 0, %sp;				\
+	and	%sp, 1, %g3;				\
+	brnz,pn	%g3, etrap_user_spill_64bit;		\
+	 srl	%sp, 0, %sp;				\
 	stwa	%l0, [%sp + 0x00] %asi;			\
 	stwa	%l1, [%sp + 0x04] %asi;			\
 	stwa	%l2, [%sp + 0x08] %asi;			\
@@ -427,7 +431,7 @@ etrap_user_spill_32bit: \
 	ba,pt	%xcc, etrap_save;			\
 	 wrpr	%g1, %cwp;				\
 	nop; nop; nop; nop;				\
-	nop; nop; nop; nop;				\
+	nop; nop;					\
 	ba,a,pt	%xcc, etrap_spill_fixup_32bit;		\
 	ba,a,pt	%xcc, etrap_spill_fixup_32bit;		\
 	ba,a,pt	%xcc, etrap_spill_fixup_32bit;
@@ -592,7 +596,9 @@ user_rtt_fill_64bit: \
 
 	/* Normal 32bit fill */
 #define FILL_2_GENERIC(ASI)				\
-	srl	%sp, 0, %sp;				\
+	and	%sp, 1, %g3;				\
+	brnz,pn	%g3, (. - (128 + 4));			\
+	 srl	%sp, 0, %sp;				\
 	lduwa	[%sp + %g0] ASI, %l0;			\
 	mov	0x04, %g2;				\
 	mov	0x08, %g3;				\
@@ -616,14 +622,16 @@ user_rtt_fill_64bit: \
 	lduwa	[%g1 + %g3] ASI, %i6;			\
 	lduwa	[%g1 + %g5] ASI, %i7;			\
 	restored;					\
-	retry; nop; nop; nop; nop;			\
+	retry; nop; nop;				\
 	b,a,pt	%xcc, fill_fixup_dax;			\
 	b,a,pt	%xcc, fill_fixup_mna;			\
 	b,a,pt	%xcc, fill_fixup;
 
 #define FILL_2_GENERIC_RTRAP				\
 user_rtt_fill_32bit:					\
-	srl	%sp, 0, %sp;				\
+	and	%sp, 1, %g3;				\
+	brnz,pn	%g3, user_rtt_fill_64bit;		\
+	 srl	%sp, 0, %sp;				\
 	lduwa	[%sp + 0x00] %asi, %l0;			\
 	lduwa	[%sp + 0x04] %asi, %l1;			\
 	lduwa	[%sp + 0x08] %asi, %l2;			\
@@ -643,7 +651,7 @@ user_rtt_fill_32bit: \
 	ba,pt	%xcc, user_rtt_pre_restore;		\
 	 restored;					\
 	nop; nop; nop; nop; nop;			\
-	nop; nop; nop; nop; nop;			\
+	nop; nop; nop;					\
 	ba,a,pt	%xcc, user_rtt_fill_fixup;		\
 	ba,a,pt	%xcc, user_rtt_fill_fixup;		\
 	ba,a,pt	%xcc, user_rtt_fill_fixup;
diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h
index 8974ef7ae920..bed86a820d09 100644
--- a/arch/sparc/include/uapi/asm/unistd.h
+++ b/arch/sparc/include/uapi/asm/unistd.h
@@ -405,8 +405,12 @@
 #define __NR_setns		337
 #define __NR_process_vm_readv	338
 #define __NR_process_vm_writev	339
+#define __NR_kern_features	340
 
-#define NR_syscalls		340
+#define NR_syscalls		341
+
+/* Bitmask values returned from kern_features system call.  */
+#define KERN_FEATURE_MIXED_MODE_STACK	0x00000001
 
 #ifdef __32bit_syscall_numbers__
 /* Sparc 32-bit only has the "setresuid32", "getresuid32" variants,
--
cgit v1.2.3
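
As a rough illustration of how userland would consume this, the sketch
below probes the new kern_features system call before deciding whether a
32-bit thread can rely on a biased, 64-bit style stack.  It is not part of
the patch; the syscall number and bitmask come from the unistd.h hunk
above, and the no-arguments, returns-a-bitmask calling convention is
assumed from the commit message.

#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_kern_features
#define __NR_kern_features		340	/* from the unistd.h hunk above */
#endif
#define KERN_FEATURE_MIXED_MODE_STACK	0x00000001

/* Returns non-zero when the kernel lets a 32-bit process run on a
 * biased (64-bit style) stack, so montmul/montsqr/mpmul inputs are
 * spilled as full 64-bit values instead of being truncated.
 */
static int have_mixed_mode_stack(void)
{
	long mask = syscall(__NR_kern_features);

	if (mask < 0)
		return 0;	/* older kernel: syscall not wired up */
	return (mask & KERN_FEATURE_MIXED_MODE_STACK) != 0;
}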
From 270c10e00a1e557e068803a22e0556281ceb1830 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Sat, 27 Oct 2012 18:05:20 -0700
Subject: sparc64: Fix cpu strand yielding.

For atomic backoff, we just loop over an exponentially backed off
counter.  This is extremely ineffective as it doesn't actually yield
the cpu strand so that other competing strands can use the cpu core.

On cpus prior to SPARC-T4 we have to do this in a slightly hackish
way, by doing an operation with no side effects that also happens to
mark the strand as unavailable.  The mechanism we choose for this is
three reads of the %ccr (condition-code) register into %g0 (the zero
register).

SPARC-T4 has an explicit "pause" instruction, and we'll make use of
that in a subsequent commit.

Also yield the strand in cpu_relax().  We really should have done this
a very long time ago.

Signed-off-by: David S. Miller
---
 arch/sparc/include/asm/backoff.h      | 5 ++++-
 arch/sparc/include/asm/processor_64.h | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)
(limited to 'arch/sparc/include')

diff --git a/arch/sparc/include/asm/backoff.h b/arch/sparc/include/asm/backoff.h
index db3af0d30fb1..64b077b3b13b 100644
--- a/arch/sparc/include/asm/backoff.h
+++ b/arch/sparc/include/asm/backoff.h
@@ -13,7 +13,10 @@
 
 #define BACKOFF_SPIN(reg, tmp, label)	\
 	mov	reg, tmp; \
-88:	brnz,pt	tmp, 88b; \
+88:	rd	%ccr, %g0; \
+	rd	%ccr, %g0; \
+	rd	%ccr, %g0; \
+	brnz,pt	tmp, 88b; \
 	sub	tmp, 1, tmp; \
 	set	BACKOFF_LIMIT, tmp; \
 	cmp	reg, tmp; \
diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h
index 4e5a483122a0..986563409469 100644
--- a/arch/sparc/include/asm/processor_64.h
+++ b/arch/sparc/include/asm/processor_64.h
@@ -196,7 +196,10 @@ extern unsigned long get_wchan(struct task_struct *task);
 #define KSTK_EIP(tsk)  (task_pt_regs(tsk)->tpc)
 #define KSTK_ESP(tsk)  (task_pt_regs(tsk)->u_regs[UREG_FP])
 
-#define cpu_relax()	barrier()
+#define cpu_relax()	asm volatile("rd	%%ccr, %%g0\n\t" \
+				     "rd	%%ccr, %%g0\n\t" \
+				     "rd	%%ccr, %%g0" \
+				     ::: "memory")
 
 /* Prefetch support.  This is tuned for UltraSPARC-III and later.
  * UltraSPARC-I will treat these as nops, and UltraSPARC-II has
--
cgit v1.2.3
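
For context, this is the sort of busy-wait loop cpu_relax() exists for; it
is a generic kernel pattern, not code from this patch.  After the change
above, each pass through cpu_relax() performs the three %ccr reads and so
yields the strand to its siblings on the same core instead of hogging it.

#include <linux/compiler.h>
#include <asm/processor.h>

/* Spin until another cpu sets *flag, yielding the strand on each pass. */
static void wait_for_flag(int *flag)
{
	while (!ACCESS_ONCE(*flag))
		cpu_relax();
}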
From e9b9eb59ffcdee09ec96b040f85c919618f4043e Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Sat, 27 Oct 2012 23:00:41 -0700
Subject: sparc64: Use pause instruction when available.

In atomic backoff and cpu_relax(), use the pause instruction found on
SPARC-T4 and later.

It makes the cpu strand unselectable for the given number of cycles,
unless an intervening disrupting trap occurs.

Signed-off-by: David S. Miller
---
 arch/sparc/include/asm/backoff.h      | 32 +++++++++++++++++++-------------
 arch/sparc/include/asm/processor_64.h | 13 ++++++++++---
 2 files changed, 29 insertions(+), 16 deletions(-)
(limited to 'arch/sparc/include')

diff --git a/arch/sparc/include/asm/backoff.h b/arch/sparc/include/asm/backoff.h
index 64b077b3b13b..20f01df0871b 100644
--- a/arch/sparc/include/asm/backoff.h
+++ b/arch/sparc/include/asm/backoff.h
@@ -11,19 +11,25 @@
 #define BACKOFF_LABEL(spin_label, continue_label) \
 	spin_label
 
-#define BACKOFF_SPIN(reg, tmp, label)	\
-	mov	reg, tmp; \
-88:	rd	%ccr, %g0; \
-	rd	%ccr, %g0; \
-	rd	%ccr, %g0; \
-	brnz,pt	tmp, 88b; \
-	sub	tmp, 1, tmp; \
-	set	BACKOFF_LIMIT, tmp; \
-	cmp	reg, tmp; \
-	bg,pn	%xcc, label; \
-	nop; \
-	ba,pt	%xcc, label; \
-	sllx	reg, 1, reg;
+#define BACKOFF_SPIN(reg, tmp, label)		\
+	mov		reg, tmp;		\
+88:	rd		%ccr, %g0;		\
+	rd		%ccr, %g0;		\
+	rd		%ccr, %g0;		\
+	.section	.pause_patch,"ax";	\
+	.word		88b;			\
+	sllx		tmp, 7, tmp;		\
+	wr		tmp, 0, %asr27;		\
+	clr		tmp;			\
+	.previous;				\
+	brnz,pt		tmp, 88b;		\
+	sub		tmp, 1, tmp;		\
+	set		BACKOFF_LIMIT, tmp;	\
+	cmp		reg, tmp;		\
+	bg,pn		%xcc, label;		\
+	nop;					\
+	ba,pt		%xcc, label;		\
+	sllx		reg, 1, reg;
 
 #else
diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h
index 986563409469..9cdf52eec48a 100644
--- a/arch/sparc/include/asm/processor_64.h
+++ b/arch/sparc/include/asm/processor_64.h
@@ -196,9 +196,16 @@ extern unsigned long get_wchan(struct task_struct *task);
 #define KSTK_EIP(tsk)  (task_pt_regs(tsk)->tpc)
 #define KSTK_ESP(tsk)  (task_pt_regs(tsk)->u_regs[UREG_FP])
 
-#define cpu_relax()	asm volatile("rd	%%ccr, %%g0\n\t" \
-				     "rd	%%ccr, %%g0\n\t" \
-				     "rd	%%ccr, %%g0" \
+#define cpu_relax()	asm volatile("\n99:\n\t" \
+				     "rd	%%ccr, %%g0\n\t" \
+				     "rd	%%ccr, %%g0\n\t" \
+				     "rd	%%ccr, %%g0\n\t" \
+				     ".section	.pause_patch,\"ax\"\n\t" \
+				     ".word	99b\n\t" \
+				     "wr	%%g0, 128, %%asr27\n\t" \
+				     "nop\n\t" \
+				     "nop\n\t" \
+				     ".previous" \
 				     ::: "memory")
 
 /* Prefetch support.  This is tuned for UltraSPARC-III and later.
  * UltraSPARC-I will treat these as nops, and UltraSPARC-II has
--
cgit v1.2.3
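
The .pause_patch section above only records patch sites; the boot-time
code that rewrites them on SPARC-T4 is not part of this hunk.  Below is a
sketch of how such a section is typically consumed, assuming one word of
address plus the three replacement instructions per entry.  The struct
layout, symbol names, and function name are illustrative, not taken from
the series.

#include <linux/init.h>

/* Assumed entry layout: the ".word 88b"/".word 99b" address followed by
 * the three replacement instruction words emitted into the section.
 */
struct pause_patch_entry {
	unsigned int	addr;
	unsigned int	insns[3];
};

extern struct pause_patch_entry __pause_patch, __pause_patch_end;

static void __init patch_pause_sites(void)
{
	struct pause_patch_entry *p;

	for (p = &__pause_patch; p < &__pause_patch_end; p++) {
		unsigned int *insns = (unsigned int *)(unsigned long)p->addr;
		int i;

		/* Overwrite the three %ccr reads with the pause sequence
		 * and flush each patched word from the instruction cache.
		 */
		for (i = 0; i < 3; i++) {
			insns[i] = p->insns[i];
			__asm__ __volatile__("flush	%0"
					     : : "r" (&insns[i]) : "memory");
		}
	}
}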
%%g0\n\t" \ + ".section .pause_patch,\"ax\"\n\t"\ + ".word 99b\n\t" \ + "wr %%g0, 128, %%asr27\n\t" \ + "nop\n\t" \ + "nop\n\t" \ + ".previous" \ ::: "memory") /* Prefetch support. This is tuned for UltraSPARC-III and later. -- cgit v1.2.3 From 187818cd6a5ab6343eac47e52da2f3e40c544b98 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 28 Oct 2012 13:04:47 -0700 Subject: sparc64: Improvde documentation and readability of atomic backoff code. Document what's going on in asm/backoff.h with a large and descriptive comment. Refer to it above the cpu_relax() definition in asm/processor_64.h Rename the pause patching section to have "3insn" in it's name like the other patching sections do. Based upon feedback from Sam Ravnborg. Signed-off-by: David S. Miller --- arch/sparc/include/asm/backoff.h | 42 ++++++++++++++++++++++++++++++++++- arch/sparc/include/asm/processor_64.h | 7 +++++- 2 files changed, 47 insertions(+), 2 deletions(-) (limited to 'arch/sparc/include') diff --git a/arch/sparc/include/asm/backoff.h b/arch/sparc/include/asm/backoff.h index 20f01df0871b..4e02086b839c 100644 --- a/arch/sparc/include/asm/backoff.h +++ b/arch/sparc/include/asm/backoff.h @@ -1,6 +1,46 @@ #ifndef _SPARC64_BACKOFF_H #define _SPARC64_BACKOFF_H +/* The macros in this file implement an exponential backoff facility + * for atomic operations. + * + * When multiple threads compete on an atomic operation, it is + * possible for one thread to be continually denied a successful + * completion of the compare-and-swap instruction. Heavily + * threaded cpu implementations like Niagara can compound this + * problem even further. + * + * When an atomic operation fails and needs to be retried, we spin a + * certain number of times. At each subsequent failure of the same + * operation we double the spin count, realizing an exponential + * backoff. + * + * When we spin, we try to use an operation that will cause the + * current cpu strand to block, and therefore make the core fully + * available to any other other runnable strands. There are two + * options, based upon cpu capabilities. + * + * On all cpus prior to SPARC-T4 we do three dummy reads of the + * condition code register. Each read blocks the strand for something + * between 40 and 50 cpu cycles. + * + * For SPARC-T4 and later we have a special "pause" instruction + * available. This is implemented using writes to register %asr27. + * The cpu will block the number of cycles written into the register, + * unless a disrupting trap happens first. SPARC-T4 specifically + * implements pause with a granularity of 8 cycles. Each strand has + * an internal pause counter which decrements every 8 cycles. So the + * chip shifts the %asr27 value down by 3 bits, and writes the result + * into the pause counter. If a value smaller than 8 is written, the + * chip blocks for 1 cycle. + * + * To achieve the same amount of backoff as the three %ccr reads give + * on earlier chips, we shift the backoff value up by 7 bits. (Three + * %ccr reads block for about 128 cycles, 1 << 7 == 128) We write the + * whole amount we want to block into the pause register, rather than + * loop writing 128 each time. 
From 1df35f80f9d3bba7cd434b64c9eaff8c9109abad Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Sun, 28 Oct 2012 13:15:09 -0700
Subject: sparc: Wire up sys_kcmp.

Signed-off-by: David S. Miller
---
 arch/sparc/include/uapi/asm/unistd.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
(limited to 'arch/sparc/include')

diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h
index bed86a820d09..cac719d1bc5c 100644
--- a/arch/sparc/include/uapi/asm/unistd.h
+++ b/arch/sparc/include/uapi/asm/unistd.h
@@ -406,8 +406,9 @@
 #define __NR_process_vm_readv	338
 #define __NR_process_vm_writev	339
 #define __NR_kern_features	340
+#define __NR_kcmp		341
 
-#define NR_syscalls		341
+#define NR_syscalls		342
 
 /* Bitmask values returned from kern_features system call.  */
 #define KERN_FEATURE_MIXED_MODE_STACK	0x00000001
--
cgit v1.2.3

From 0bce04be442cf4d6e4ba9dac2f0a4c5ee88af5c5 Mon Sep 17 00:00:00 2001
From: Andreas Larsson
Date: Tue, 6 Nov 2012 00:12:03 +0000
Subject: of/address: sparc: Declare of_address_to_resource() as an extern
 function for sparc again

This bug-fix makes sure that of_address_to_resource is declared extern
for sparc, so that the sparc-specific implementation of
of_address_to_resource() is once again used when including
include/linux/of_address.h in a sparc context.  A number of drivers in
mainline rely on this function working for sparc.

The bug was introduced in a850a7554442f08d3e910c6eeb4ee216868dda1e,
"of/address: add empty static inlines for !CONFIG_OF".  Contrary to
that commit title, the static inlines are added for
!CONFIG_OF_ADDRESS, and CONFIG_OF_ADDRESS is never defined for sparc.
This is good behavior for the other functions in
include/linux/of_address.h, as the extern functions defined in
drivers/of/address.c only get linked when OF_ADDRESS is configured.
However, for of_address_to_resource there exists a sparc-specific
implementation in arch/sparc/kernel/of_device_common.c.

Solution suggested by: Sam Ravnborg
Signed-off-by: Andreas Larsson
Acked-by: Rob Herring
Signed-off-by: David S. Miller
---
 arch/sparc/include/asm/prom.h | 5 +++++
 1 file changed, 5 insertions(+)
(limited to 'arch/sparc/include')

diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h
index c28765110706..f93003123bce 100644
--- a/arch/sparc/include/asm/prom.h
+++ b/arch/sparc/include/asm/prom.h
@@ -63,5 +63,10 @@ extern char *of_console_options;
 extern void irq_trans_init(struct device_node *dp);
 extern char *build_path_component(struct device_node *dp);
 
+/* SPARC has a local implementation */
+extern int of_address_to_resource(struct device_node *dev, int index,
+				  struct resource *r);
+#define of_address_to_resource of_address_to_resource
+
 #endif /* __KERNEL__ */
 #endif /* _SPARC_PROM_H */
--
cgit v1.2.3
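
For reference, here is a typical driver-side use of the sparc-provided
of_address_to_resource() that this declaration makes reachable again
through linux/of_address.h.  It is a sketch of a common pattern, not code
from the series, and the helper name is made up.

#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/ioport.h>
#include <linux/io.h>

/* Map the first "reg" window of a device node, or return NULL. */
static void __iomem *map_first_reg_window(struct device_node *np)
{
	struct resource res;

	if (of_address_to_resource(np, 0, &res))
		return NULL;

	return ioremap(res.start, resource_size(&res));
}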
From 193d2aadc0ff5c687f6f0d5ef1d38c86ab511a14 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Fri, 9 Nov 2012 19:37:59 -0800
Subject: sparc: Support atomic64_dec_if_positive properly.

Sparc32 already supported it, as a consequence of using the generic
atomic64 implementation.  And the sparc64 implementation is rather
trivial.

This allows us to set ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE for all of
sparc, and avoid the annoying warning from lib/atomic64_test.c

Signed-off-by: David S. Miller
---
 arch/sparc/include/asm/atomic_64.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
(limited to 'arch/sparc/include')

diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index ce35a1cf1a20..be56a244c9cf 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -1,7 +1,7 @@
 /* atomic.h: Thankfully the V9 is at least reasonable for this
  *           stuff.
  *
- * Copyright (C) 1996, 1997, 2000 David S. Miller (davem@redhat.com)
+ * Copyright (C) 1996, 1997, 2000, 2012 David S. Miller (davem@redhat.com)
  */
 
 #ifndef __ARCH_SPARC64_ATOMIC__
@@ -106,6 +106,8 @@ static inline long atomic64_add_unless(atomic64_t *v, long a, long u)
 
 #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
 
+extern long atomic64_dec_if_positive(atomic64_t *v);
+
 /* Atomic operations are already serializing */
 #define smp_mb__before_atomic_dec()	barrier()
 #define smp_mb__after_atomic_dec()	barrier()
--
cgit v1.2.3
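
To show the kind of call site this declaration serves, here is a small
illustrative consumer (not from the series): atomic64_dec_if_positive()
only performs the decrement when the result stays non-negative, so it can
hand out a credit without an explicit cmpxchg loop at the call site.

#include <linux/atomic.h>
#include <linux/types.h>

static atomic64_t credits = ATOMIC64_INIT(16);

/* Take one credit if any remain; a negative return value means the
 * counter was already zero (or below) and was left untouched.
 */
static bool take_credit(void)
{
	return atomic64_dec_if_positive(&credits) >= 0;
}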