diff options
| author | Eric Biggers <ebiggers@kernel.org> | 2025-07-12 16:23:04 -0700 |
|---|---|---|
| committer | Eric Biggers <ebiggers@kernel.org> | 2025-07-14 11:28:35 -0700 |
| commit | f3d6cb3dc0394b866bc0d1e15157ce45844cf3d3 (patch) | |
| tree | 49d272f7f91c2cb5dfe9339107b8b487aa47f664 /arch/x86/crypto | |
| parent | c751059985e02467c7fa6b14676c1d56d089b3cc (diff) | |
lib/crypto: x86/sha1: Migrate optimized code into library
Instead of exposing the x86-optimized SHA-1 code via x86-specific
crypto_shash algorithms, just implement the sha1_blocks()
library function. This is much simpler, it makes the SHA-1 library
functions x86-optimized, and it fixes the longstanding issue where
the x86-optimized SHA-1 code was disabled by default. SHA-1 still
remains available through crypto_shash, but individual architectures no
longer need to handle it.
To match sha1_blocks(), change the type of the nblocks parameter of the
assembly functions from int to size_t. The assembly functions actually
already treated it as size_t.
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250712232329.818226-14-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Diffstat (limited to 'arch/x86/crypto')
| -rw-r--r-- | arch/x86/crypto/Kconfig | 14 | ||||
| -rw-r--r-- | arch/x86/crypto/Makefile | 3 | ||||
| -rw-r--r-- | arch/x86/crypto/sha1_avx2_x86_64_asm.S | 700 | ||||
| -rw-r--r-- | arch/x86/crypto/sha1_ni_asm.S | 304 | ||||
| -rw-r--r-- | arch/x86/crypto/sha1_ssse3_asm.S | 554 | ||||
| -rw-r--r-- | arch/x86/crypto/sha1_ssse3_glue.c | 324 |
6 files changed, 0 insertions, 1899 deletions
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index eb641a300154..94016c60561e 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -376,20 +376,6 @@ config CRYPTO_POLYVAL_CLMUL_NI Architecture: x86_64 using: - CLMUL-NI (carry-less multiplication new instructions) -config CRYPTO_SHA1_SSSE3 - tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)" - depends on 64BIT - select CRYPTO_SHA1 - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX (Advanced Vector Extensions) - - AVX2 (Advanced Vector Extensions 2) - - SHA-NI (SHA Extensions New Instructions) - config CRYPTO_SM3_AVX_X86_64 tristate "Hash functions: SM3 (AVX)" depends on 64BIT diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index d31348be8370..d402963d6b57 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -51,9 +51,6 @@ ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy) aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o endif -obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o -sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o - obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S deleted file mode 100644 index 4b49bdc95265..000000000000 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ /dev/null @@ -1,700 +0,0 @@ -/* - * Implement fast SHA-1 with AVX2 instructions. (x86_64) - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2014 Intel Corporation. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Ilya Albrekht <ilya.albrekht@intel.com> - * Maxim Locktyukhin <maxim.locktyukhin@intel.com> - * Ronen Zohar <ronen.zohar@intel.com> - * Chandramouli Narayanan <mouli@linux.intel.com> - * - * BSD LICENSE - * - * Copyright(c) 2014 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -/* - * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. - * - *This implementation is based on the previous SSSE3 release: - *Visit http://software.intel.com/en-us/articles/ - *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ - * - *Updates 20-byte SHA-1 record at start of 'state', from 'input', for - *even number of 'blocks' consecutive 64-byte blocks. - * - *extern "C" void sha1_transform_avx2( - * struct sha1_state *state, const u8* input, int blocks ); - */ - -#include <linux/linkage.h> - -#define CTX %rdi /* arg1 */ -#define BUF %rsi /* arg2 */ -#define CNT %rdx /* arg3 */ - -#define REG_A %ecx -#define REG_B %esi -#define REG_C %edi -#define REG_D %eax -#define REG_E %edx -#define REG_TB %ebx -#define REG_TA %r12d -#define REG_RA %rcx -#define REG_RB %rsi -#define REG_RC %rdi -#define REG_RD %rax -#define REG_RE %rdx -#define REG_RTA %r12 -#define REG_RTB %rbx -#define REG_T1 %r11d -#define xmm_mov vmovups -#define avx2_zeroupper vzeroupper -#define RND_F1 1 -#define RND_F2 2 -#define RND_F3 3 - -.macro REGALLOC - .set A, REG_A - .set B, REG_B - .set C, REG_C - .set D, REG_D - .set E, REG_E - .set TB, REG_TB - .set TA, REG_TA - - .set RA, REG_RA - .set RB, REG_RB - .set RC, REG_RC - .set RD, REG_RD - .set RE, REG_RE - - .set RTA, REG_RTA - .set RTB, REG_RTB - - .set T1, REG_T1 -.endm - -#define HASH_PTR %r9 -#define BLOCKS_CTR %r8 -#define BUFFER_PTR %r10 -#define BUFFER_PTR2 
%r13 - -#define PRECALC_BUF %r14 -#define WK_BUF %r15 - -#define W_TMP %xmm0 -#define WY_TMP %ymm0 -#define WY_TMP2 %ymm9 - -# AVX2 variables -#define WY0 %ymm3 -#define WY4 %ymm5 -#define WY08 %ymm7 -#define WY12 %ymm8 -#define WY16 %ymm12 -#define WY20 %ymm13 -#define WY24 %ymm14 -#define WY28 %ymm15 - -#define YMM_SHUFB_BSWAP %ymm10 - -/* - * Keep 2 iterations precalculated at a time: - * - 80 DWORDs per iteration * 2 - */ -#define W_SIZE (80*2*2 +16) - -#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) -#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) - - -.macro UPDATE_HASH hash, val - add \hash, \val - mov \val, \hash -.endm - -.macro PRECALC_RESET_WY - .set WY_00, WY0 - .set WY_04, WY4 - .set WY_08, WY08 - .set WY_12, WY12 - .set WY_16, WY16 - .set WY_20, WY20 - .set WY_24, WY24 - .set WY_28, WY28 - .set WY_32, WY_00 -.endm - -.macro PRECALC_ROTATE_WY - /* Rotate macros */ - .set WY_32, WY_28 - .set WY_28, WY_24 - .set WY_24, WY_20 - .set WY_20, WY_16 - .set WY_16, WY_12 - .set WY_12, WY_08 - .set WY_08, WY_04 - .set WY_04, WY_00 - .set WY_00, WY_32 - - /* Define register aliases */ - .set WY, WY_00 - .set WY_minus_04, WY_04 - .set WY_minus_08, WY_08 - .set WY_minus_12, WY_12 - .set WY_minus_16, WY_16 - .set WY_minus_20, WY_20 - .set WY_minus_24, WY_24 - .set WY_minus_28, WY_28 - .set WY_minus_32, WY -.endm - -.macro PRECALC_00_15 - .if (i == 0) # Initialize and rotate registers - PRECALC_RESET_WY - PRECALC_ROTATE_WY - .endif - - /* message scheduling pre-compute for rounds 0-15 */ - .if ((i & 7) == 0) - /* - * blended AVX2 and ALU instruction scheduling - * 1 vector iteration per 8 rounds - */ - vmovdqu (i * 2)(BUFFER_PTR), W_TMP - .elseif ((i & 7) == 1) - vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ - WY_TMP, WY_TMP - .elseif ((i & 7) == 2) - vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY - .elseif ((i & 7) == 4) - vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP - .elseif ((i & 7) == 7) - vmovdqu WY_TMP, PRECALC_WK(i&~7) - - PRECALC_ROTATE_WY - .endif 
-.endm - -.macro PRECALC_16_31 - /* - * message scheduling pre-compute for rounds 16-31 - * calculating last 32 w[i] values in 8 XMM registers - * pre-calculate K+w[i] values and store to mem - * for later load by ALU add instruction - * - * "brute force" vectorization for rounds 16-31 only - * due to w[i]->w[i-3] dependency - */ - .if ((i & 7) == 0) - /* - * blended AVX2 and ALU instruction scheduling - * 1 vector iteration per 8 rounds - */ - /* w[i-14] */ - vpalignr $8, WY_minus_16, WY_minus_12, WY - vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ - .elseif ((i & 7) == 1) - vpxor WY_minus_08, WY, WY - vpxor WY_minus_16, WY_TMP, WY_TMP - .elseif ((i & 7) == 2) - vpxor WY_TMP, WY, WY - vpslldq $12, WY, WY_TMP2 - .elseif ((i & 7) == 3) - vpslld $1, WY, WY_TMP - vpsrld $31, WY, WY - .elseif ((i & 7) == 4) - vpor WY, WY_TMP, WY_TMP - vpslld $2, WY_TMP2, WY - .elseif ((i & 7) == 5) - vpsrld $30, WY_TMP2, WY_TMP2 - vpxor WY, WY_TMP, WY_TMP - .elseif ((i & 7) == 7) - vpxor WY_TMP2, WY_TMP, WY - vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP - vmovdqu WY_TMP, PRECALC_WK(i&~7) - - PRECALC_ROTATE_WY - .endif -.endm - -.macro PRECALC_32_79 - /* - * in SHA-1 specification: - * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 - * instead we do equal: - * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 - * allows more efficient vectorization - * since w[i]=>w[i-3] dependency is broken - */ - - .if ((i & 7) == 0) - /* - * blended AVX2 and ALU instruction scheduling - * 1 vector iteration per 8 rounds - */ - vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP - .elseif ((i & 7) == 1) - /* W is W_minus_32 before xor */ - vpxor WY_minus_28, WY, WY - .elseif ((i & 7) == 2) - vpxor WY_minus_16, WY_TMP, WY_TMP - .elseif ((i & 7) == 3) - vpxor WY_TMP, WY, WY - .elseif ((i & 7) == 4) - vpslld $2, WY, WY_TMP - .elseif ((i & 7) == 5) - vpsrld $30, WY, WY - vpor WY, WY_TMP, WY - .elseif ((i & 7) == 7) - vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP - vmovdqu WY_TMP, PRECALC_WK(i&~7) - - 
PRECALC_ROTATE_WY - .endif -.endm - -.macro PRECALC r, s - .set i, \r - - .if (i < 40) - .set K_XMM, 32*0 - .elseif (i < 80) - .set K_XMM, 32*1 - .elseif (i < 120) - .set K_XMM, 32*2 - .else - .set K_XMM, 32*3 - .endif - - .if (i<32) - PRECALC_00_15 \s - .elseif (i<64) - PRECALC_16_31 \s - .elseif (i < 160) - PRECALC_32_79 \s - .endif -.endm - -.macro ROTATE_STATE - .set T_REG, E - .set E, D - .set D, C - .set C, B - .set B, TB - .set TB, A - .set A, T_REG - - .set T_REG, RE - .set RE, RD - .set RD, RC - .set RC, RB - .set RB, RTB - .set RTB, RA - .set RA, T_REG -.endm - -/* Macro relies on saved ROUND_Fx */ - -.macro RND_FUN f, r - .if (\f == RND_F1) - ROUND_F1 \r - .elseif (\f == RND_F2) - ROUND_F2 \r - .elseif (\f == RND_F3) - ROUND_F3 \r - .endif -.endm - -.macro RR r - .set round_id, (\r % 80) - - .if (round_id == 0) /* Precalculate F for first round */ - .set ROUND_FUNC, RND_F1 - mov B, TB - - rorx $(32-30), B, B /* b>>>2 */ - andn D, TB, T1 - and C, TB - xor T1, TB - .endif - - RND_FUN ROUND_FUNC, \r - ROTATE_STATE - - .if (round_id == 18) - .set ROUND_FUNC, RND_F2 - .elseif (round_id == 38) - .set ROUND_FUNC, RND_F3 - .elseif (round_id == 58) - .set ROUND_FUNC, RND_F2 - .endif - - .set round_id, ( (\r+1) % 80) - - RND_FUN ROUND_FUNC, (\r+1) - ROTATE_STATE -.endm - -.macro ROUND_F1 r - add WK(\r), E - - andn C, A, T1 /* ~b&d */ - lea (RE,RTB), E /* Add F from the previous round */ - - rorx $(32-5), A, TA /* T2 = A >>> 5 */ - rorx $(32-30),A, TB /* b>>>2 for next round */ - - PRECALC (\r) /* msg scheduling for next 2 blocks */ - - /* - * Calculate F for the next round - * (b & c) ^ andn[b, d] - */ - and B, A /* b&c */ - xor T1, A /* F1 = (b&c) ^ (~b&d) */ - - lea (RE,RTA), E /* E += A >>> 5 */ -.endm - -.macro ROUND_F2 r - add WK(\r), E - lea (RE,RTB), E /* Add F from the previous round */ - - /* Calculate F for the next round */ - rorx $(32-5), A, TA /* T2 = A >>> 5 */ - .if ((round_id) < 79) - rorx $(32-30), A, TB /* b>>>2 for next round */ - .endif - 
PRECALC (\r) /* msg scheduling for next 2 blocks */ - - .if ((round_id) < 79) - xor B, A - .endif - - add TA, E /* E += A >>> 5 */ - - .if ((round_id) < 79) - xor C, A - .endif -.endm - -.macro ROUND_F3 r - add WK(\r), E - PRECALC (\r) /* msg scheduling for next 2 blocks */ - - lea (RE,RTB), E /* Add F from the previous round */ - - mov B, T1 - or A, T1 - - rorx $(32-5), A, TA /* T2 = A >>> 5 */ - rorx $(32-30), A, TB /* b>>>2 for next round */ - - /* Calculate F for the next round - * (b and c) or (d and (b or c)) - */ - and C, T1 - and B, A - or T1, A - - add TA, E /* E += A >>> 5 */ - -.endm - -/* Add constant only if (%2 > %3) condition met (uses RTA as temp) - * %1 + %2 >= %3 ? %4 : 0 - */ -.macro ADD_IF_GE a, b, c, d - mov \a, RTA - add $\d, RTA - cmp $\c, \b - cmovge RTA, \a -.endm - -/* - * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining - */ -.macro SHA1_PIPELINED_MAIN_BODY - - REGALLOC - - mov (HASH_PTR), A - mov 4(HASH_PTR), B - mov 8(HASH_PTR), C - mov 12(HASH_PTR), D - mov 16(HASH_PTR), E - - mov %rsp, PRECALC_BUF - lea (2*4*80+32)(%rsp), WK_BUF - - # Precalc WK for first 2 blocks - ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 - .set i, 0 - .rept 160 - PRECALC i - .set i, i + 1 - .endr - - /* Go to next block if needed */ - ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 - ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 - xchg WK_BUF, PRECALC_BUF - - .align 32 -.L_loop: - /* - * code loops through more than one block - * we use K_BASE value as a signal of a last block, - * it is set below by: cmovae BUFFER_PTR, K_BASE - */ - test BLOCKS_CTR, BLOCKS_CTR - jnz .L_begin - .align 32 - jmp .L_end - .align 32 -.L_begin: - - /* - * Do first block - * rounds: 0,2,4,6,8 - */ - .set j, 0 - .rept 5 - RR j - .set j, j+2 - .endr - - /* - * rounds: - * 10,12,14,16,18 - * 20,22,24,26,28 - * 30,32,34,36,38 - * 40,42,44,46,48 - * 50,52,54,56,58 - */ - .rept 25 - RR j - .set j, j+2 - .endr - - /* Update Counter */ - sub $1, BLOCKS_CTR - /* Move to the next 
block only if needed*/ - ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 - /* - * rounds - * 60,62,64,66,68 - * 70,72,74,76,78 - */ - .rept 10 - RR j - .set j, j+2 - .endr - - UPDATE_HASH (HASH_PTR), A - UPDATE_HASH 4(HASH_PTR), TB - UPDATE_HASH 8(HASH_PTR), C - UPDATE_HASH 12(HASH_PTR), D - UPDATE_HASH 16(HASH_PTR), E - - test BLOCKS_CTR, BLOCKS_CTR - jz .L_loop - - mov TB, B - - /* Process second block */ - /* - * rounds - * 0+80, 2+80, 4+80, 6+80, 8+80 - * 10+80,12+80,14+80,16+80,18+80 - */ - - .set j, 0 - .rept 10 - RR j+80 - .set j, j+2 - .endr - - /* - * rounds - * 20+80,22+80,24+80,26+80,28+80 - * 30+80,32+80,34+80,36+80,38+80 - */ - .rept 10 - RR j+80 - .set j, j+2 - .endr - - /* - * rounds - * 40+80,42+80,44+80,46+80,48+80 - * 50+80,52+80,54+80,56+80,58+80 - */ - .rept 10 - RR j+80 - .set j, j+2 - .endr - - /* update counter */ - sub $1, BLOCKS_CTR - /* Move to the next block only if needed*/ - ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 - - /* - * rounds - * 60+80,62+80,64+80,66+80,68+80 - * 70+80,72+80,74+80,76+80,78+80 - */ - .rept 10 - RR j+80 - .set j, j+2 - .endr - - UPDATE_HASH (HASH_PTR), A - UPDATE_HASH 4(HASH_PTR), TB - UPDATE_HASH 8(HASH_PTR), C - UPDATE_HASH 12(HASH_PTR), D - UPDATE_HASH 16(HASH_PTR), E - - /* Reset state for AVX2 reg permutation */ - mov A, TA - mov TB, A - mov C, TB - mov E, C - mov D, B - mov TA, D - - REGALLOC - - xchg WK_BUF, PRECALC_BUF - - jmp .L_loop - - .align 32 -.L_end: - -.endm -/* - * macro implements SHA-1 function's body for several 64-byte blocks - * param: function's name - */ -.macro SHA1_VECTOR_ASM name - SYM_FUNC_START(\name) - - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - RESERVE_STACK = (W_SIZE*4 + 8+24) - - /* Align stack */ - push %rbp - mov %rsp, %rbp - and $~(0x20-1), %rsp - sub $RESERVE_STACK, %rsp - - avx2_zeroupper - - /* Setup initial values */ - mov CTX, HASH_PTR - mov BUF, BUFFER_PTR - - mov BUF, BUFFER_PTR2 - mov CNT, BLOCKS_CTR - - xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP - - 
SHA1_PIPELINED_MAIN_BODY - - avx2_zeroupper - - mov %rbp, %rsp - pop %rbp - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - - RET - - SYM_FUNC_END(\name) -.endm - -.section .rodata - -#define K1 0x5a827999 -#define K2 0x6ed9eba1 -#define K3 0x8f1bbcdc -#define K4 0xca62c1d6 - -.align 128 -K_XMM_AR: - .long K1, K1, K1, K1 - .long K1, K1, K1, K1 - .long K2, K2, K2, K2 - .long K2, K2, K2, K2 - .long K3, K3, K3, K3 - .long K3, K3, K3, K3 - .long K4, K4, K4, K4 - .long K4, K4, K4, K4 - -BSWAP_SHUFB_CTL: - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f -.text - -SHA1_VECTOR_ASM sha1_transform_avx2 diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S deleted file mode 100644 index cade913d4882..000000000000 --- a/arch/x86/crypto/sha1_ni_asm.S +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Intel SHA Extensions optimized implementation of a SHA-1 update function - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Sean Gulley <sean.m.gulley@intel.com> - * Tim Chen <tim.c.chen@linux.intel.com> - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - */ - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -#define DIGEST_PTR %rdi /* 1st arg */ -#define DATA_PTR %rsi /* 2nd arg */ -#define NUM_BLKS %rdx /* 3rd arg */ - -/* gcc conversion */ -#define FRAME_SIZE 32 /* space for 2x16 bytes */ - -#define ABCD %xmm0 -#define E0 %xmm1 /* Need two E's b/c they ping pong */ -#define E1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 -#define SHUF_MASK %xmm7 - - -/* - * Intel SHA Extensions optimized implementation of a SHA-1 update function - * - * The function takes a pointer to the current hash values, a pointer to the - * input data, and a number of 64 byte blocks to process. Once all blocks have - * been processed, the digest pointer is updated with the resulting hash value. - * The function only processes complete blocks, there is no functionality to - * store partial blocks. All message padding and hash value initialization must - * be done outside the update function. - * - * The indented lines in the loop are instructions related to rounds processing. - * The non-indented lines are instructions related to the message schedule. 
- * - * void sha1_ni_transform(uint32_t *digest, const void *data, - uint32_t numBlocks) - * digest : pointer to digest - * data: pointer to input data - * numBlocks: Number of blocks to process - */ -.text -SYM_TYPED_FUNC_START(sha1_ni_transform) - push %rbp - mov %rsp, %rbp - sub $FRAME_SIZE, %rsp - and $~0xF, %rsp - - shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash - add DATA_PTR, NUM_BLKS /* pointer to end of data */ - - /* load initial hash values */ - pinsrd $3, 1*16(DIGEST_PTR), E0 - movdqu 0*16(DIGEST_PTR), ABCD - pand UPPER_WORD_MASK(%rip), E0 - pshufd $0x1B, ABCD, ABCD - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - -.Lloop0: - /* Save hash values for addition after rounds */ - movdqa E0, (0*16)(%rsp) - movdqa ABCD, (1*16)(%rsp) - - /* Rounds 0-3 */ - movdqu 0*16(DATA_PTR), MSG0 - pshufb SHUF_MASK, MSG0 - paddd MSG0, E0 - movdqa ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - movdqu 1*16(DATA_PTR), MSG1 - pshufb SHUF_MASK, MSG1 - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - movdqu 2*16(DATA_PTR), MSG2 - pshufb SHUF_MASK, MSG2 - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 12-15 */ - movdqu 3*16(DATA_PTR), MSG3 - pshufb SHUF_MASK, MSG3 - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 28-31 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, 
ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 44-47 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - pxor MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1rnds4 $3, E1, ABCD - - /* Add current hash values with previously saved */ - sha1nexte (0*16)(%rsp), E0 - paddd (1*16)(%rsp), ABCD - - /* Increment data pointer and loop if more to process */ - add $64, DATA_PTR - cmp NUM_BLKS, DATA_PTR - jne .Lloop0 - - /* Write hash values back in the correct order */ - 
pshufd $0x1B, ABCD, ABCD - movdqu ABCD, 0*16(DIGEST_PTR) - pextrd $3, E0, 1*16(DIGEST_PTR) - -.Ldone_hash: - mov %rbp, %rsp - pop %rbp - - RET -SYM_FUNC_END(sha1_ni_transform) - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f - -.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16 -.align 16 -UPPER_WORD_MASK: - .octa 0xFFFFFFFF000000000000000000000000 diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S deleted file mode 100644 index f54988c80eb4..000000000000 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ /dev/null @@ -1,554 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental - * SSE3 instruction set extensions introduced in Intel Core Microarchitecture - * processors. CPUs supporting Intel(R) AVX extensions will get an additional - * boost. - * - * This work was inspired by the vectorized implementation of Dean Gaudet. - * Additional information on it can be found at: - * http://www.arctic.org/~dean/crypto/sha1.html - * - * It was improved upon with more efficient vectorization of the message - * scheduling. This implementation has also been optimized for all current and - * several future generations of Intel CPUs. - * - * See this article for more information about the implementation details: - * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ - * - * Copyright (C) 2010, Intel Corp. 
- * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com> - * Ronen Zohar <ronen.zohar@intel.com> - * - * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: - * Author: Mathias Krause <minipli@googlemail.com> - */ - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -#define CTX %rdi // arg1 -#define BUF %rsi // arg2 -#define CNT %rdx // arg3 - -#define REG_A %ecx -#define REG_B %esi -#define REG_C %edi -#define REG_D %r12d -#define REG_E %edx - -#define REG_T1 %eax -#define REG_T2 %ebx - -#define K_BASE %r8 -#define HASH_PTR %r9 -#define BUFFER_PTR %r10 -#define BUFFER_END %r11 - -#define W_TMP1 %xmm0 -#define W_TMP2 %xmm9 - -#define W0 %xmm1 -#define W4 %xmm2 -#define W8 %xmm3 -#define W12 %xmm4 -#define W16 %xmm5 -#define W20 %xmm6 -#define W24 %xmm7 -#define W28 %xmm8 - -#define XMM_SHUFB_BSWAP %xmm10 - -/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ -#define WK(t) (((t) & 15) * 4)(%rsp) -#define W_PRECALC_AHEAD 16 - -/* - * This macro implements the SHA-1 function's body for single 64-byte block - * param: function's name - */ -.macro SHA1_VECTOR_ASM name - SYM_TYPED_FUNC_START(\name) - - push %rbx - push %r12 - push %rbp - mov %rsp, %rbp - - sub $64, %rsp # allocate workspace - and $~15, %rsp # align stack - - mov CTX, HASH_PTR - mov BUF, BUFFER_PTR - - shl $6, CNT # multiply by 64 - add BUF, CNT - mov CNT, BUFFER_END - - lea K_XMM_AR(%rip), K_BASE - xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP - - SHA1_PIPELINED_MAIN_BODY - - # cleanup workspace - mov $8, %ecx - mov %rsp, %rdi - xor %eax, %eax - rep stosq - - mov %rbp, %rsp # deallocate workspace - pop %rbp - pop %r12 - pop %rbx - RET - - SYM_FUNC_END(\name) -.endm - -/* - * This macro implements 80 rounds of SHA-1 for one 64-byte block - */ -.macro SHA1_PIPELINED_MAIN_BODY - INIT_REGALLOC - - mov (HASH_PTR), A - mov 4(HASH_PTR), B - mov 8(HASH_PTR), C - mov 12(HASH_PTR), D - mov 16(HASH_PTR), E - - .set i, 0 - .rept W_PRECALC_AHEAD - W_PRECALC 
i - .set i, (i+1) - .endr - -.align 4 -1: - RR F1,A,B,C,D,E,0 - RR F1,D,E,A,B,C,2 - RR F1,B,C,D,E,A,4 - RR F1,E,A,B,C,D,6 - RR F1,C,D,E,A,B,8 - - RR F1,A,B,C,D,E,10 - RR F1,D,E,A,B,C,12 - RR F1,B,C,D,E,A,14 - RR F1,E,A,B,C,D,16 - RR F1,C,D,E,A,B,18 - - RR F2,A,B,C,D,E,20 - RR F2,D,E,A,B,C,22 - RR F2,B,C,D,E,A,24 - RR F2,E,A,B,C,D,26 - RR F2,C,D,E,A,B,28 - - RR F2,A,B,C,D,E,30 - RR F2,D,E,A,B,C,32 - RR F2,B,C,D,E,A,34 - RR F2,E,A,B,C,D,36 - RR F2,C,D,E,A,B,38 - - RR F3,A,B,C,D,E,40 - RR F3,D,E,A,B,C,42 - RR F3,B,C,D,E,A,44 - RR F3,E,A,B,C,D,46 - RR F3,C,D,E,A,B,48 - - RR F3,A,B,C,D,E,50 - RR F3,D,E,A,B,C,52 - RR F3,B,C,D,E,A,54 - RR F3,E,A,B,C,D,56 - RR F3,C,D,E,A,B,58 - - add $64, BUFFER_PTR # move to the next 64-byte block - cmp BUFFER_END, BUFFER_PTR # if the current is the last one use - cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun - - RR F4,A,B,C,D,E,60 - RR F4,D,E,A,B,C,62 - RR F4,B,C,D,E,A,64 - RR F4,E,A,B,C,D,66 - RR F4,C,D,E,A,B,68 - - RR F4,A,B,C,D,E,70 - RR F4,D,E,A,B,C,72 - RR F4,B,C,D,E,A,74 - RR F4,E,A,B,C,D,76 - RR F4,C,D,E,A,B,78 - - UPDATE_HASH (HASH_PTR), A - UPDATE_HASH 4(HASH_PTR), B - UPDATE_HASH 8(HASH_PTR), C - UPDATE_HASH 12(HASH_PTR), D - UPDATE_HASH 16(HASH_PTR), E - - RESTORE_RENAMED_REGS - cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end - jne 1b -.endm - -.macro INIT_REGALLOC - .set A, REG_A - .set B, REG_B - .set C, REG_C - .set D, REG_D - .set E, REG_E - .set T1, REG_T1 - .set T2, REG_T2 -.endm - -.macro RESTORE_RENAMED_REGS - # order is important (REG_C is where it should be) - mov B, REG_B - mov D, REG_D - mov A, REG_A - mov E, REG_E -.endm - -.macro SWAP_REG_NAMES a, b - .set _T, \a - .set \a, \b - .set \b, _T -.endm - -.macro F1 b, c, d - mov \c, T1 - SWAP_REG_NAMES \c, T1 - xor \d, T1 - and \b, T1 - xor \d, T1 -.endm - -.macro F2 b, c, d - mov \d, T1 - SWAP_REG_NAMES \d, T1 - xor \c, T1 - xor \b, T1 -.endm - -.macro F3 b, c ,d - mov \c, T1 - SWAP_REG_NAMES \c, T1 - mov \b, T2 - or \b, T1 - and \c, T2 - 
and \d, T1 - or T2, T1 -.endm - -.macro F4 b, c, d - F2 \b, \c, \d -.endm - -.macro UPDATE_HASH hash, val - add \hash, \val - mov \val, \hash -.endm - -/* - * RR does two rounds of SHA-1 back to back with W[] pre-calc - * t1 = F(b, c, d); e += w(i) - * e += t1; b <<= 30; d += w(i+1); - * t1 = F(a, b, c); - * d += t1; a <<= 5; - * e += a; - * t1 = e; a >>= 7; - * t1 <<= 5; - * d += t1; - */ -.macro RR F, a, b, c, d, e, round - add WK(\round), \e - \F \b, \c, \d # t1 = F(b, c, d); - W_PRECALC (\round + W_PRECALC_AHEAD) - rol $30, \b - add T1, \e - add WK(\round + 1), \d - - \F \a, \b, \c - W_PRECALC (\round + W_PRECALC_AHEAD + 1) - rol $5, \a - add \a, \e - add T1, \d - ror $7, \a # (a <<r 5) >>r 7) => a <<r 30) - - mov \e, T1 - SWAP_REG_NAMES \e, T1 - - rol $5, T1 - add T1, \d - - # write: \a, \b - # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c -.endm - -.macro W_PRECALC r - .set i, \r - - .if (i < 20) - .set K_XMM, 0 - .elseif (i < 40) - .set K_XMM, 16 - .elseif (i < 60) - .set K_XMM, 32 - .elseif (i < 80) - .set K_XMM, 48 - .endif - - .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD)))) - .set i, ((\r) % 80) # pre-compute for the next iteration - .if (i == 0) - W_PRECALC_RESET - .endif - W_PRECALC_00_15 - .elseif (i<32) - W_PRECALC_16_31 - .elseif (i < 80) // rounds 32-79 - W_PRECALC_32_79 - .endif -.endm - -.macro W_PRECALC_RESET - .set W, W0 - .set W_minus_04, W4 - .set W_minus_08, W8 - .set W_minus_12, W12 - .set W_minus_16, W16 - .set W_minus_20, W20 - .set W_minus_24, W24 - .set W_minus_28, W28 - .set W_minus_32, W -.endm - -.macro W_PRECALC_ROTATE - .set W_minus_32, W_minus_28 - .set W_minus_28, W_minus_24 - .set W_minus_24, W_minus_20 - .set W_minus_20, W_minus_16 - .set W_minus_16, W_minus_12 - .set W_minus_12, W_minus_08 - .set W_minus_08, W_minus_04 - .set W_minus_04, W - .set W, W_minus_32 -.endm - -.macro W_PRECALC_SSSE3 - -.macro W_PRECALC_00_15 - W_PRECALC_00_15_SSSE3 -.endm -.macro W_PRECALC_16_31 - W_PRECALC_16_31_SSSE3 -.endm -.macro 
W_PRECALC_32_79 - W_PRECALC_32_79_SSSE3 -.endm - -/* message scheduling pre-compute for rounds 0-15 */ -.macro W_PRECALC_00_15_SSSE3 - .if ((i & 3) == 0) - movdqu (i*4)(BUFFER_PTR), W_TMP1 - .elseif ((i & 3) == 1) - pshufb XMM_SHUFB_BSWAP, W_TMP1 - movdqa W_TMP1, W - .elseif ((i & 3) == 2) - paddd (K_BASE), W_TMP1 - .elseif ((i & 3) == 3) - movdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -/* message scheduling pre-compute for rounds 16-31 - * - * - calculating last 32 w[i] values in 8 XMM registers - * - pre-calculate K+w[i] values and store to mem, for later load by ALU add - * instruction - * - * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] - * dependency, but improves for 32-79 - */ -.macro W_PRECALC_16_31_SSSE3 - # blended scheduling of vector and scalar instruction streams, one 4-wide - # vector iteration / 4 scalar rounds - .if ((i & 3) == 0) - movdqa W_minus_12, W - palignr $8, W_minus_16, W # w[i-14] - movdqa W_minus_04, W_TMP1 - psrldq $4, W_TMP1 # w[i-3] - pxor W_minus_08, W - .elseif ((i & 3) == 1) - pxor W_minus_16, W_TMP1 - pxor W_TMP1, W - movdqa W, W_TMP2 - movdqa W, W_TMP1 - pslldq $12, W_TMP2 - .elseif ((i & 3) == 2) - psrld $31, W - pslld $1, W_TMP1 - por W, W_TMP1 - movdqa W_TMP2, W - psrld $30, W_TMP2 - pslld $2, W - .elseif ((i & 3) == 3) - pxor W, W_TMP1 - pxor W_TMP2, W_TMP1 - movdqa W_TMP1, W - paddd K_XMM(K_BASE), W_TMP1 - movdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -/* message scheduling pre-compute for rounds 32-79 - * - * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 - * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 - * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken - */ -.macro W_PRECALC_32_79_SSSE3 - .if ((i & 3) == 0) - movdqa W_minus_04, W_TMP1 - pxor W_minus_28, W # W is W_minus_32 before xor - palignr $8, W_minus_08, W_TMP1 - .elseif ((i & 3) == 1) - pxor W_minus_16, W - pxor W_TMP1, W - 
movdqa W, W_TMP1 - .elseif ((i & 3) == 2) - psrld $30, W - pslld $2, W_TMP1 - por W, W_TMP1 - .elseif ((i & 3) == 3) - movdqa W_TMP1, W - paddd K_XMM(K_BASE), W_TMP1 - movdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.endm // W_PRECALC_SSSE3 - - -#define K1 0x5a827999 -#define K2 0x6ed9eba1 -#define K3 0x8f1bbcdc -#define K4 0xca62c1d6 - -.section .rodata -.align 16 - -K_XMM_AR: - .long K1, K1, K1, K1 - .long K2, K2, K2, K2 - .long K3, K3, K3, K3 - .long K4, K4, K4, K4 - -BSWAP_SHUFB_CTL: - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f - - -.section .text - -W_PRECALC_SSSE3 -.macro xmm_mov a, b - movdqu \a,\b -.endm - -/* - * SSSE3 optimized implementation: - * - * extern "C" void sha1_transform_ssse3(struct sha1_state *state, - * const u8 *data, int blocks); - * - * Note that struct sha1_state is assumed to begin with u32 state[5]. - */ -SHA1_VECTOR_ASM sha1_transform_ssse3 - -.macro W_PRECALC_AVX - -.purgem W_PRECALC_00_15 -.macro W_PRECALC_00_15 - W_PRECALC_00_15_AVX -.endm -.purgem W_PRECALC_16_31 -.macro W_PRECALC_16_31 - W_PRECALC_16_31_AVX -.endm -.purgem W_PRECALC_32_79 -.macro W_PRECALC_32_79 - W_PRECALC_32_79_AVX -.endm - -.macro W_PRECALC_00_15_AVX - .if ((i & 3) == 0) - vmovdqu (i*4)(BUFFER_PTR), W_TMP1 - .elseif ((i & 3) == 1) - vpshufb XMM_SHUFB_BSWAP, W_TMP1, W - .elseif ((i & 3) == 2) - vpaddd (K_BASE), W, W_TMP1 - .elseif ((i & 3) == 3) - vmovdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.macro W_PRECALC_16_31_AVX - .if ((i & 3) == 0) - vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] - vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] - vpxor W_minus_08, W, W - vpxor W_minus_16, W_TMP1, W_TMP1 - .elseif ((i & 3) == 1) - vpxor W_TMP1, W, W - vpslldq $12, W, W_TMP2 - vpslld $1, W, W_TMP1 - .elseif ((i & 3) == 2) - vpsrld $31, W, W - vpor W, W_TMP1, W_TMP1 - vpslld $2, W_TMP2, W - vpsrld $30, W_TMP2, W_TMP2 - .elseif ((i & 3) == 3) - vpxor W, W_TMP1, W_TMP1 - vpxor W_TMP2, W_TMP1, W - vpaddd K_XMM(K_BASE), 
W, W_TMP1 - vmovdqu W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.macro W_PRECALC_32_79_AVX - .if ((i & 3) == 0) - vpalignr $8, W_minus_08, W_minus_04, W_TMP1 - vpxor W_minus_28, W, W # W is W_minus_32 before xor - .elseif ((i & 3) == 1) - vpxor W_minus_16, W_TMP1, W_TMP1 - vpxor W_TMP1, W, W - .elseif ((i & 3) == 2) - vpslld $2, W, W_TMP1 - vpsrld $30, W, W - vpor W, W_TMP1, W - .elseif ((i & 3) == 3) - vpaddd K_XMM(K_BASE), W, W_TMP1 - vmovdqu W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.endm // W_PRECALC_AVX - -W_PRECALC_AVX -.purgem xmm_mov -.macro xmm_mov a, b - vmovdqu \a,\b -.endm - - -/* AVX optimized implementation: - * extern "C" void sha1_transform_avx(struct sha1_state *state, - * const u8 *data, int blocks); - */ -SHA1_VECTOR_ASM sha1_transform_avx diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c deleted file mode 100644 index 826579a7473c..000000000000 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ /dev/null @@ -1,324 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * - * Glue code for the SHA1 Secure Hash Algorithm assembler implementations - * using SSSE3, AVX, AVX2, and SHA-NI instructions. - * - * This file is based on sha1_generic.c - * - * Copyright (c) Alan Smithee. 
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> - * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> - * Copyright (c) Mathias Krause <minipli@googlemail.com> - * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <asm/cpu_device_id.h> -#include <asm/simd.h> -#include <crypto/internal/hash.h> -#include <crypto/sha1.h> -#include <crypto/sha1_base.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> - -static const struct x86_cpu_id module_cpu_ids[] = { - X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), - X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); - -static inline int sha1_update_x86(struct shash_desc *desc, const u8 *data, - unsigned int len, sha1_block_fn *sha1_xform) -{ - int remain; - - /* - * Make sure struct sha1_state begins directly with the SHA1 - * 160-bit internal state, as this is what the asm functions expect. 
- */ - BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); - - kernel_fpu_begin(); - remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform); - kernel_fpu_end(); - - return remain; -} - -static inline int sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, - sha1_block_fn *sha1_xform) -{ - kernel_fpu_begin(); - sha1_base_do_finup(desc, data, len, sha1_xform); - kernel_fpu_end(); - - return sha1_base_finish(desc, out); -} - -asmlinkage void sha1_transform_ssse3(struct sha1_state *state, - const u8 *data, int blocks); - -static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update_x86(desc, data, len, sha1_transform_ssse3); -} - -static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_transform_ssse3); -} - -static struct shash_alg sha1_ssse3_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_ssse3_update, - .finup = sha1_ssse3_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ssse3", - .cra_priority = 150, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int register_sha1_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - return crypto_register_shash(&sha1_ssse3_alg); - return 0; -} - -static void unregister_sha1_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shash(&sha1_ssse3_alg); -} - -asmlinkage void sha1_transform_avx(struct sha1_state *state, - const u8 *data, int blocks); - -static int sha1_avx_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update_x86(desc, data, len, sha1_transform_avx); -} - -static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, 
sha1_transform_avx); -} - -static struct shash_alg sha1_avx_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_avx_update, - .finup = sha1_avx_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-avx", - .cra_priority = 160, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static bool avx_usable(void) -{ - if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (boot_cpu_has(X86_FEATURE_AVX)) - pr_info("AVX detected but unusable.\n"); - return false; - } - - return true; -} - -static int register_sha1_avx(void) -{ - if (avx_usable()) - return crypto_register_shash(&sha1_avx_alg); - return 0; -} - -static void unregister_sha1_avx(void) -{ - if (avx_usable()) - crypto_unregister_shash(&sha1_avx_alg); -} - -#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ - -asmlinkage void sha1_transform_avx2(struct sha1_state *state, - const u8 *data, int blocks); - -static bool avx2_usable(void) -{ - if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) - && boot_cpu_has(X86_FEATURE_BMI1) - && boot_cpu_has(X86_FEATURE_BMI2)) - return true; - - return false; -} - -static inline void sha1_apply_transform_avx2(struct sha1_state *state, - const u8 *data, int blocks) -{ - /* Select the optimal transform based on data block size */ - if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE) - sha1_transform_avx2(state, data, blocks); - else - sha1_transform_avx(state, data, blocks); -} - -static int sha1_avx2_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update_x86(desc, data, len, sha1_apply_transform_avx2); -} - -static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2); -} - -static struct shash_alg sha1_avx2_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = 
sha1_base_init, - .update = sha1_avx2_update, - .finup = sha1_avx2_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-avx2", - .cra_priority = 170, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int register_sha1_avx2(void) -{ - if (avx2_usable()) - return crypto_register_shash(&sha1_avx2_alg); - return 0; -} - -static void unregister_sha1_avx2(void) -{ - if (avx2_usable()) - crypto_unregister_shash(&sha1_avx2_alg); -} - -asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data, - int rounds); - -static int sha1_ni_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update_x86(desc, data, len, sha1_ni_transform); -} - -static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_ni_transform); -} - -static struct shash_alg sha1_ni_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_ni_update, - .finup = sha1_ni_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ni", - .cra_priority = 250, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int register_sha1_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - return crypto_register_shash(&sha1_ni_alg); - return 0; -} - -static void unregister_sha1_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - crypto_unregister_shash(&sha1_ni_alg); -} - -static int __init sha1_ssse3_mod_init(void) -{ - if (!x86_match_cpu(module_cpu_ids)) - return -ENODEV; - - if (register_sha1_ssse3()) - goto fail; - - if (register_sha1_avx()) { - unregister_sha1_ssse3(); - goto fail; - } - - if (register_sha1_avx2()) { - unregister_sha1_avx(); - unregister_sha1_ssse3(); - goto fail; - } - - if (register_sha1_ni()) { - 
unregister_sha1_avx2(); - unregister_sha1_avx(); - unregister_sha1_ssse3(); - goto fail; - } - - return 0; -fail: - return -ENODEV; -} - -static void __exit sha1_ssse3_mod_fini(void) -{ - unregister_sha1_ni(); - unregister_sha1_avx2(); - unregister_sha1_avx(); - unregister_sha1_ssse3(); -} - -module_init(sha1_ssse3_mod_init); -module_exit(sha1_ssse3_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated"); - -MODULE_ALIAS_CRYPTO("sha1"); -MODULE_ALIAS_CRYPTO("sha1-ssse3"); -MODULE_ALIAS_CRYPTO("sha1-avx"); -MODULE_ALIAS_CRYPTO("sha1-avx2"); -MODULE_ALIAS_CRYPTO("sha1-ni"); |
