From bc4f65e4cf9d6cc43e0e9ba0b8648cf9201cd55f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:35:05 +1000 Subject: powerpc/64: Avoid restore_math call if possible in syscall exit The syscall exit code that branches to restore_math is quite heavy on Book3S, consisting of 2 mtmsr instructions. Threads that don't use both FP and vector can get caught here if the kernel ever uses FP or vector. Lazy-FP/vec context switching also trips this case. So check for lazy FP and vector before switching RI for restore_math. Move most of this case out of line. For threads that do want to restore math registers, the MSR switches are still suboptimal. Future direction may be to use a soft-RI bit to avoid MSR switches in kernel (similar to soft-EE), but for now at least the no-restore POWER9 context switch rate increases by about 5% due to sched_yield(2) return performance. I haven't constructed a test to measure the syscall cost. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/powerpc/kernel/process.c') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index baae104b16c7..5cbb8b1faf7e 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -511,6 +511,10 @@ void restore_math(struct pt_regs *regs) { unsigned long msr; + /* + * Syscall exit makes a similar initial check before branching + * to restore_math. Keep them in synch. + */ if (!msr_tm_active(regs->msr) && !current->thread.load_fp && !loadvec(current->thread)) return; -- cgit v1.2.3 From e4c0fc5f72bca11432297168338aef46c12793a4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:36:06 +1000 Subject: powerpc/64s: Leave interrupts hard enabled in context switch for radix Commit 4387e9ff25 ("[POWERPC] Fix PMU + soft interrupt disable bug") hard disabled interrupts over the low level context switch, because the SLB management can't cope with a PMU interrupt accesing the stack in that window. Radix based kernel mapping does not use the SLB so it does not require interrupts hard disabled here. This is worth 1-2% in context switch performance on POWER9. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/kernel/process.c') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 5cbb8b1faf7e..45faa9a32a01 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1199,12 +1199,14 @@ struct task_struct *__switch_to(struct task_struct *prev, __switch_to_tm(prev, new); - /* - * We can't take a PMU exception inside _switch() since there is a - * window where the kernel stack SLB and the kernel stack are out - * of sync. Hard disable here. - */ - hard_irq_disable(); + if (!radix_enabled()) { + /* + * We can't take a PMU exception inside _switch() since there + * is a window where the kernel stack SLB and the kernel stack + * are out of sync. Hard disable here. + */ + hard_irq_disable(); + } /* * Call restore_sprs() before calling _switch(). If we move it after -- cgit v1.2.3 From 07d2a628bc0008f90754ac7982289f6cb0f46cf8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:36:09 +1000 Subject: powerpc/64s: Avoid cpabort in context switch when possible The ISA v3.0B copy-paste facility only requires cpabort when switching to a process that has foreign real addresses mapped (direct access to accelerators), to clear a potential copy buffer filled by a previous thread. There is no accelerator driver implemented yet, so cpabort can be removed. It can be be re-added when a driver is implemented. POWER9 DD1 requires the copy buffer to always be cleared on context switch, but if accelerators are not in use, then an unpaired copy from a dummy region is sufficient to clear data out of the copy buffer. This increases context switch performance by about 5% on POWER9. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel/process.c') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 45faa9a32a01..6273b5d5baec 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1137,6 +1137,11 @@ static inline void restore_sprs(struct thread_struct *old_thread, #endif } +#ifdef CONFIG_PPC_BOOK3S_64 +#define CP_SIZE 128 +static const u8 dummy_copy_buffer[CP_SIZE] __attribute__((aligned(CP_SIZE))); +#endif + struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *new) { @@ -1226,8 +1231,28 @@ struct task_struct *__switch_to(struct task_struct *prev, batch->active = 1; } - if (current_thread_info()->task->thread.regs) + if (current_thread_info()->task->thread.regs) { restore_math(current_thread_info()->task->thread.regs); + + /* + * The copy-paste buffer can only store into foreign real + * addresses, so unprivileged processes can not see the + * data or use it in any way unless they have foreign real + * mappings. We don't have a VAS driver that allocates those + * yet, so no cpabort is required. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + /* + * DD1 allows paste into normal system memory, so we + * do an unpaired copy here to clear the buffer and + * prevent a covert channel being set up. + * + * cpabort is not used because it is quite expensive. + */ + asm volatile(PPC_COPY(%0, %1) + : : "r"(dummy_copy_buffer), "r"(0)); + } + } #endif /* CONFIG_PPC_STD_MMU_64 */ return last; -- cgit v1.2.3