From e49a3eac9207e9575337f70feeb29430f6f16bb7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Jun 2025 11:48:14 -0700 Subject: lib/crypto: Explicitly include Fix build warnings with W=1 that started appearing after commit a934a57a42f6 ("scripts/misc-check: check missing #include when W=1"). While at it, also sort the include lists alphabetically. (Keep asm/irqflags.h last, as otherwise it doesn't build on alpha.) This handles all of lib/crypto/, but not arch/*/lib/crypto/. The exports in arch/*/lib/crypto/ will go away when the code is properly integrated into lib/crypto/ as planned. Link: https://lore.kernel.org/r/20250613184814.50173-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/aes.c | 1 + lib/crypto/aescfb.c | 7 +++---- lib/crypto/aesgcm.c | 5 ++--- lib/crypto/arc4.c | 1 + lib/crypto/blake2s-generic.c | 9 +++++---- lib/crypto/blake2s.c | 9 +++++---- lib/crypto/chacha.c | 6 +++--- lib/crypto/chacha20poly1305.c | 8 ++++---- lib/crypto/curve25519-generic.c | 1 + lib/crypto/des.c | 7 +++---- lib/crypto/gf128mul.c | 1 + lib/crypto/libchacha.c | 7 +++---- lib/crypto/memneq.c | 3 ++- lib/crypto/mpi/mpi-add.c | 2 ++ lib/crypto/mpi/mpi-bit.c | 2 ++ lib/crypto/mpi/mpi-cmp.c | 2 ++ lib/crypto/mpi/mpi-mul.c | 2 ++ lib/crypto/mpi/mpi-pow.c | 2 ++ lib/crypto/mpi/mpi-sub-ui.c | 2 ++ lib/crypto/mpi/mpicoder.c | 3 ++- lib/crypto/mpi/mpiutil.c | 2 ++ lib/crypto/poly1305-donna32.c | 3 ++- lib/crypto/poly1305-donna64.c | 3 ++- lib/crypto/poly1305-generic.c | 1 + lib/crypto/poly1305.c | 1 + lib/crypto/sha1.c | 6 +++--- lib/crypto/sha256-generic.c | 1 + lib/crypto/sha256.c | 1 + lib/crypto/sm3.c | 1 + lib/crypto/utils.c | 3 ++- 30 files changed, 64 insertions(+), 38 deletions(-) diff --git a/lib/crypto/aes.c b/lib/crypto/aes.c index eafe14d021f5..b57fda3460f1 100644 --- a/lib/crypto/aes.c +++ b/lib/crypto/aes.c @@ -5,6 +5,7 @@ #include #include +#include #include #include diff --git a/lib/crypto/aescfb.c b/lib/crypto/aescfb.c index 2f09ae92ffa0..0f294c8cbf3c 100644 --- a/lib/crypto/aescfb.c +++ b/lib/crypto/aescfb.c @@ -5,11 +5,10 @@ * Copyright 2023 Google LLC */ -#include - -#include #include - +#include +#include +#include #include static void aescfb_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst, diff --git a/lib/crypto/aesgcm.c b/lib/crypto/aesgcm.c index faa4dee9bb1b..ac0b2fcfd606 100644 --- a/lib/crypto/aesgcm.c +++ b/lib/crypto/aesgcm.c @@ -5,12 +5,11 @@ * Copyright 2022 Google LLC */ -#include - #include #include #include - +#include +#include #include static void aesgcm_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst, diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c index 838812d18216..4e950e1e66d0 100644 --- a/lib/crypto/arc4.c +++ b/lib/crypto/arc4.c @@ -8,6 +8,7 @@ */ #include +#include #include int arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len) diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c index 09682136b57c..9828176a2efe 100644 --- a/lib/crypto/blake2s-generic.c +++ b/lib/crypto/blake2s-generic.c @@ -9,11 +9,12 @@ */ #include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include #include static const u8 blake2s_sigma[10][16] = { diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index b0f9a678300b..f6ec68c3dcda 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -9,12 +9,13 @@ */ #include -#include -#include +#include +#include +#include #include #include -#include -#include +#include +#include static inline void 
blake2s_set_lastblock(struct blake2s_state *state) { diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha.c index ced87dd31a97..77f68de71066 100644 --- a/lib/crypto/chacha.c +++ b/lib/crypto/chacha.c @@ -5,13 +5,13 @@ * Copyright (C) 2015 Martin Willi */ +#include +#include #include -#include #include -#include +#include #include #include -#include static void chacha_permute(struct chacha_state *state, int nrounds) { diff --git a/lib/crypto/chacha20poly1305.c b/lib/crypto/chacha20poly1305.c index e29eed49a5a1..0b49d6aedefd 100644 --- a/lib/crypto/chacha20poly1305.c +++ b/lib/crypto/chacha20poly1305.c @@ -7,16 +7,16 @@ * Information: https://tools.ietf.org/html/rfc8439 */ -#include #include +#include #include #include - -#include -#include +#include #include +#include #include #include +#include static void chacha_load_key(u32 *k, const u8 *in) { diff --git a/lib/crypto/curve25519-generic.c b/lib/crypto/curve25519-generic.c index de7c99172fa2..f8aa70c9f559 100644 --- a/lib/crypto/curve25519-generic.c +++ b/lib/crypto/curve25519-generic.c @@ -10,6 +10,7 @@ */ #include +#include #include const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 }; diff --git a/lib/crypto/des.c b/lib/crypto/des.c index d3423b34a8e9..a906070136dc 100644 --- a/lib/crypto/des.c +++ b/lib/crypto/des.c @@ -7,21 +7,20 @@ * Copyright (c) 2005 Dag Arne Osvik */ +#include +#include #include #include #include #include +#include #include #include #include #include #include - #include -#include -#include - #define ROL(x, r) ((x) = rol32((x), (r))) #define ROR(x, r) ((x) = ror32((x), (r))) diff --git a/lib/crypto/gf128mul.c b/lib/crypto/gf128mul.c index fbe72cb3453a..2a34590fe3f1 100644 --- a/lib/crypto/gf128mul.c +++ b/lib/crypto/gf128mul.c @@ -49,6 +49,7 @@ */ #include +#include #include #include #include diff --git a/lib/crypto/libchacha.c b/lib/crypto/libchacha.c index ebcca381e248..26862ad90a96 100644 --- a/lib/crypto/libchacha.c +++ b/lib/crypto/libchacha.c @@ -5,12 +5,11 @@ * Copyright (C) 2015 Martin Willi */ -#include -#include -#include - #include // for crypto_xor_cpy #include +#include +#include +#include void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) diff --git a/lib/crypto/memneq.c b/lib/crypto/memneq.c index a2afd10349c9..44daacb8cb51 100644 --- a/lib/crypto/memneq.c +++ b/lib/crypto/memneq.c @@ -59,9 +59,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include #include +#include #include +#include /* Generic path for arbitrary size */ static inline unsigned long diff --git a/lib/crypto/mpi/mpi-add.c b/lib/crypto/mpi/mpi-add.c index 3015140d4860..c0375c1672fa 100644 --- a/lib/crypto/mpi/mpi-add.c +++ b/lib/crypto/mpi/mpi-add.c @@ -11,6 +11,8 @@ * to avoid revealing of sensitive data due to paging etc. 
*/ +#include + #include "mpi-internal.h" int mpi_add(MPI w, MPI u, MPI v) diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c index 934d81311360..b3d0e7ddbc03 100644 --- a/lib/crypto/mpi/mpi-bit.c +++ b/lib/crypto/mpi/mpi-bit.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ +#include + #include "mpi-internal.h" #include "longlong.h" diff --git a/lib/crypto/mpi/mpi-cmp.c b/lib/crypto/mpi/mpi-cmp.c index ceaebe181cd7..b42929296bce 100644 --- a/lib/crypto/mpi/mpi-cmp.c +++ b/lib/crypto/mpi/mpi-cmp.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ +#include + #include "mpi-internal.h" int mpi_cmp_ui(MPI u, unsigned long v) diff --git a/lib/crypto/mpi/mpi-mul.c b/lib/crypto/mpi/mpi-mul.c index 7e6ff1ce3e9b..d79f186ad90b 100644 --- a/lib/crypto/mpi/mpi-mul.c +++ b/lib/crypto/mpi/mpi-mul.c @@ -11,6 +11,8 @@ * to avoid revealing of sensitive data due to paging etc. */ +#include + #include "mpi-internal.h" int mpi_mul(MPI w, MPI u, MPI v) diff --git a/lib/crypto/mpi/mpi-pow.c b/lib/crypto/mpi/mpi-pow.c index 67fbd4c2503d..9e695a3bda3a 100644 --- a/lib/crypto/mpi/mpi-pow.c +++ b/lib/crypto/mpi/mpi-pow.c @@ -13,8 +13,10 @@ * however I decided to publish this code under the plain GPL. */ +#include #include #include + #include "mpi-internal.h" #include "longlong.h" diff --git a/lib/crypto/mpi/mpi-sub-ui.c b/lib/crypto/mpi/mpi-sub-ui.c index b41b082b5f3e..0edcdbd24833 100644 --- a/lib/crypto/mpi/mpi-sub-ui.c +++ b/lib/crypto/mpi/mpi-sub-ui.c @@ -32,6 +32,8 @@ * see https://www.gnu.org/licenses/. */ +#include + #include "mpi-internal.h" int mpi_sub_ui(MPI w, MPI u, unsigned long vval) diff --git a/lib/crypto/mpi/mpicoder.c b/lib/crypto/mpi/mpicoder.c index dde01030807d..47f6939599b3 100644 --- a/lib/crypto/mpi/mpicoder.c +++ b/lib/crypto/mpi/mpicoder.c @@ -19,8 +19,9 @@ */ #include -#include #include +#include +#include #include #include #include "mpi-internal.h" diff --git a/lib/crypto/mpi/mpiutil.c b/lib/crypto/mpi/mpiutil.c index 979ece5a81d2..7f2db830f404 100644 --- a/lib/crypto/mpi/mpiutil.c +++ b/lib/crypto/mpi/mpiutil.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ +#include + #include "mpi-internal.h" /**************** diff --git a/lib/crypto/poly1305-donna32.c b/lib/crypto/poly1305-donna32.c index 0a4a2d99e365..b66131b3f6d4 100644 --- a/lib/crypto/poly1305-donna32.c +++ b/lib/crypto/poly1305-donna32.c @@ -6,9 +6,10 @@ * public domain. */ +#include +#include #include #include -#include void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[POLY1305_BLOCK_SIZE]) diff --git a/lib/crypto/poly1305-donna64.c b/lib/crypto/poly1305-donna64.c index 530287531b2e..8a72a5a84944 100644 --- a/lib/crypto/poly1305-donna64.c +++ b/lib/crypto/poly1305-donna64.c @@ -6,9 +6,10 @@ * public domain. 
*/ +#include +#include #include #include -#include void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[POLY1305_BLOCK_SIZE]) diff --git a/lib/crypto/poly1305-generic.c b/lib/crypto/poly1305-generic.c index a73f700fa1fb..71a16c5c538b 100644 --- a/lib/crypto/poly1305-generic.c +++ b/lib/crypto/poly1305-generic.c @@ -8,6 +8,7 @@ */ #include +#include #include #include diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c index 5f2f2af3b59f..a6dc182b6c22 100644 --- a/lib/crypto/poly1305.c +++ b/lib/crypto/poly1305.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c index ebb60519ae93..6d809c3088be 100644 --- a/lib/crypto/sha1.c +++ b/lib/crypto/sha1.c @@ -6,12 +6,12 @@ * This was based on the git SHA1 implementation. */ -#include +#include +#include #include +#include #include -#include #include -#include #include /* diff --git a/lib/crypto/sha256-generic.c b/lib/crypto/sha256-generic.c index a16ad4f25ebb..2968d95d0403 100644 --- a/lib/crypto/sha256-generic.c +++ b/lib/crypto/sha256-generic.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 107e5162507a..6bfa4ae8dfb5 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/lib/crypto/sm3.c b/lib/crypto/sm3.c index efff0e267d84..c6b9ad8a3ac6 100644 --- a/lib/crypto/sm3.c +++ b/lib/crypto/sm3.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include diff --git a/lib/crypto/utils.c b/lib/crypto/utils.c index 87da2a6dd161..dec381d5e906 100644 --- a/lib/crypto/utils.c +++ b/lib/crypto/utils.c @@ -5,9 +5,10 @@ * Copyright (c) 2006 Herbert Xu */ -#include #include +#include #include +#include /* * XOR @len bytes from @src1 and @src2 together, writing the result to @dst -- cgit v1.2.3 From e0fca177556c4419819876ac5a947b0844115d56 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:05 -0700 Subject: crypto: sha512 - Rename conflicting symbols Rename existing functions and structs in architecture-optimized SHA-512 code that had names conflicting with the upcoming library interface which will be added to : sha384_init, sha512_init, sha512_update, sha384, and sha512. Note: all affected code will be superseded by later commits that migrate the arch-optimized SHA-512 code into the library. This commit simply keeps the kernel building for the initial introduction of the library. 
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/crypto/sha512-glue.c | 8 ++++---- arch/s390/crypto/sha512_s390.c | 8 ++++---- arch/sparc/crypto/sha512_glue.c | 14 +++++++------- arch/x86/crypto/sha512_ssse3_glue.c | 10 +++++----- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c index 15aa9d8b7b2c..a78e184c100f 100644 --- a/arch/arm64/crypto/sha512-glue.c +++ b/arch/arm64/crypto/sha512-glue.c @@ -27,8 +27,8 @@ static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src, sha512_blocks_arch(sst->state, src, blocks); } -static int sha512_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha512_update_arm64(struct shash_desc *desc, const u8 *data, + unsigned int len) { return sha512_base_do_update_blocks(desc, data, len, sha512_arm64_transform); @@ -44,7 +44,7 @@ static int sha512_finup(struct shash_desc *desc, const u8 *data, static struct shash_alg algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, - .update = sha512_update, + .update = sha512_update_arm64, .finup = sha512_finup, .descsize = SHA512_STATE_SIZE, .base.cra_name = "sha512", @@ -57,7 +57,7 @@ static struct shash_alg algs[] = { { }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, - .update = sha512_update, + .update = sha512_update_arm64, .finup = sha512_finup, .descsize = SHA512_STATE_SIZE, .base.cra_name = "sha384", diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c index 33711a29618c..e8bb172dbed7 100644 --- a/arch/s390/crypto/sha512_s390.c +++ b/arch/s390/crypto/sha512_s390.c @@ -17,7 +17,7 @@ #include "sha.h" -static int sha512_init(struct shash_desc *desc) +static int sha512_init_s390(struct shash_desc *desc) { struct s390_sha_ctx *ctx = shash_desc_ctx(desc); @@ -62,7 +62,7 @@ static int sha512_import(struct shash_desc *desc, const void *in) static struct shash_alg sha512_alg = { .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_init, + .init = sha512_init_s390, .update = s390_sha_update_blocks, .finup = s390_sha_finup, .export = sha512_export, @@ -82,7 +82,7 @@ static struct shash_alg sha512_alg = { MODULE_ALIAS_CRYPTO("sha512"); -static int sha384_init(struct shash_desc *desc) +static int sha384_init_s390(struct shash_desc *desc) { struct s390_sha_ctx *ctx = shash_desc_ctx(desc); @@ -103,7 +103,7 @@ static int sha384_init(struct shash_desc *desc) static struct shash_alg sha384_alg = { .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_init, + .init = sha384_init_s390, .update = s390_sha_update_blocks, .finup = s390_sha_finup, .export = sha512_export, diff --git a/arch/sparc/crypto/sha512_glue.c b/arch/sparc/crypto/sha512_glue.c index 47b9277b6877..fb81c3290c8c 100644 --- a/arch/sparc/crypto/sha512_glue.c +++ b/arch/sparc/crypto/sha512_glue.c @@ -40,7 +40,7 @@ static int sha512_sparc64_finup(struct shash_desc *desc, const u8 *src, return sha512_base_finish(desc, out); } -static struct shash_alg sha512 = { +static struct shash_alg sha512_alg = { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_sparc64_update, @@ -55,7 +55,7 @@ static struct shash_alg sha512 = { } }; -static struct shash_alg sha384 = { +static struct shash_alg sha384_alg = { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_sparc64_update, @@ -87,13 +87,13 @@ static bool __init sparc64_has_sha512_opcode(void) static int __init 
sha512_sparc64_mod_init(void) { if (sparc64_has_sha512_opcode()) { - int ret = crypto_register_shash(&sha384); + int ret = crypto_register_shash(&sha384_alg); if (ret < 0) return ret; - ret = crypto_register_shash(&sha512); + ret = crypto_register_shash(&sha512_alg); if (ret < 0) { - crypto_unregister_shash(&sha384); + crypto_unregister_shash(&sha384_alg); return ret; } @@ -106,8 +106,8 @@ static int __init sha512_sparc64_mod_init(void) static void __exit sha512_sparc64_mod_fini(void) { - crypto_unregister_shash(&sha384); - crypto_unregister_shash(&sha512); + crypto_unregister_shash(&sha384_alg); + crypto_unregister_shash(&sha512_alg); } module_init(sha512_sparc64_mod_init); diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 067684c54395..97744b7d2381 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -38,8 +38,8 @@ asmlinkage void sha512_transform_ssse3(struct sha512_state *state, const u8 *data, int blocks); -static int sha512_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha512_block_fn *sha512_xform) +static int sha512_update_x86(struct shash_desc *desc, const u8 *data, + unsigned int len, sha512_block_fn *sha512_xform) { int remain; @@ -69,7 +69,7 @@ static int sha512_finup(struct shash_desc *desc, const u8 *data, static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha512_update(desc, data, len, sha512_transform_ssse3); + return sha512_update_x86(desc, data, len, sha512_transform_ssse3); } static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, @@ -141,7 +141,7 @@ static bool avx_usable(void) static int sha512_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha512_update(desc, data, len, sha512_transform_avx); + return sha512_update_x86(desc, data, len, sha512_transform_avx); } static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, @@ -203,7 +203,7 @@ asmlinkage void sha512_transform_rorx(struct sha512_state *state, static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha512_update(desc, data, len, sha512_transform_rorx); + return sha512_update_x86(desc, data, len, sha512_transform_rorx); } static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, -- cgit v1.2.3 From b693c703accb08cbd52f0b94d810d6abbca3bfb9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:06 -0700 Subject: lib/crypto: sha512: Add support for SHA-384 and SHA-512 Add basic support for SHA-384 and SHA-512 to lib/crypto/. Various in-kernel users will be able to use this instead of the old-school crypto API, which is harder to use and has more overhead. The basic support added by this commit consists of the API and its documentation, backed by a C implementation of the algorithms. sha512_block_generic() is derived from crypto/sha512_generic.c. 
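For illustration, a minimal usage sketch of the new interface (the example_* wrappers and their parameters are illustrative only; the functions, context structs, and size constants they use are the ones declared in <crypto/sha2.h> by this patch):

	#include <crypto/sha2.h>

	/* One-shot: hash a complete buffer in a single call. */
	static void example_oneshot(const u8 *msg, size_t len,
				    u8 digest[SHA512_DIGEST_SIZE])
	{
		sha512(msg, len, digest);
	}

	/* Incremental: hash a message that arrives in pieces. */
	static void example_incremental(const u8 *part1, size_t len1,
					const u8 *part2, size_t len2,
					u8 digest[SHA384_DIGEST_SIZE])
	{
		struct sha384_ctx ctx;

		sha384_init(&ctx);
		sha384_update(&ctx, part1, len1);
		sha384_update(&ctx, part2, len2);
		sha384_final(&ctx, digest);	/* also zeroizes ctx */
	}
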
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha2.h | 128 ++++++++++++++++++++++++ lib/crypto/Kconfig | 10 ++ lib/crypto/Makefile | 6 ++ lib/crypto/sha512.c | 265 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 409 insertions(+) create mode 100644 lib/crypto/sha512.c diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index 4912572578dc..f2a6e84b2840 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -129,4 +129,132 @@ static inline void sha224_init(struct sha256_state *sctx) /* Simply use sha256_update as it is equivalent to sha224_update. */ void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]); +/* State for the SHA-512 (and SHA-384) compression function */ +struct sha512_block_state { + u64 h[8]; +}; + +/* + * Context structure, shared by SHA-384 and SHA-512. The sha384_ctx and + * sha512_ctx structs wrap this one so that the API has proper typing and + * doesn't allow mixing the SHA-384 and SHA-512 functions arbitrarily. + */ +struct __sha512_ctx { + struct sha512_block_state state; + u64 bytecount_lo; + u64 bytecount_hi; + u8 buf[SHA512_BLOCK_SIZE] __aligned(__alignof__(__be64)); +}; +void __sha512_update(struct __sha512_ctx *ctx, const u8 *data, size_t len); + +/** + * struct sha384_ctx - Context for hashing a message with SHA-384 + * @ctx: private + */ +struct sha384_ctx { + struct __sha512_ctx ctx; +}; + +/** + * sha384_init() - Initialize a SHA-384 context for a new message + * @ctx: the context to initialize + * + * If you don't need incremental computation, consider sha384() instead. + * + * Context: Any context. + */ +void sha384_init(struct sha384_ctx *ctx); + +/** + * sha384_update() - Update a SHA-384 context with message data + * @ctx: the context to update; must have been initialized + * @data: the message data + * @len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +static inline void sha384_update(struct sha384_ctx *ctx, + const u8 *data, size_t len) +{ + __sha512_update(&ctx->ctx, data, len); +} + +/** + * sha384_final() - Finish computing a SHA-384 message digest + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting SHA-384 message digest + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]); + +/** + * sha384() - Compute SHA-384 message digest in one shot + * @data: the message data + * @len: the data length in bytes + * @out: (output) the resulting SHA-384 message digest + * + * Context: Any context. + */ +void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE]); + +/** + * struct sha512_ctx - Context for hashing a message with SHA-512 + * @ctx: private + */ +struct sha512_ctx { + struct __sha512_ctx ctx; +}; + +/** + * sha512_init() - Initialize a SHA-512 context for a new message + * @ctx: the context to initialize + * + * If you don't need incremental computation, consider sha512() instead. + * + * Context: Any context. + */ +void sha512_init(struct sha512_ctx *ctx); + +/** + * sha512_update() - Update a SHA-512 context with message data + * @ctx: the context to update; must have been initialized + * @data: the message data + * @len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. 
+ */ +static inline void sha512_update(struct sha512_ctx *ctx, + const u8 *data, size_t len) +{ + __sha512_update(&ctx->ctx, data, len); +} + +/** + * sha512_final() - Finish computing a SHA-512 message digest + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting SHA-512 message digest + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]); + +/** + * sha512() - Compute SHA-512 message digest in one shot + * @data: the message data + * @len: the data length in bytes + * @out: (output) the resulting SHA-512 message digest + * + * Context: Any context. + */ +void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]); + #endif /* _CRYPTO_SHA2_H */ diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 1ec1466108cc..2d295c0e0f79 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -167,6 +167,16 @@ config CRYPTO_LIB_SHA256_GENERIC for SIMD implementations. If no arch specific implementation is enabled, this implementation serves the users of CRYPTO_LIB_SHA256. +config CRYPTO_LIB_SHA512 + tristate + help + The SHA-384 and SHA-512 library functions. Select this if your module + uses any of these functions from . + +config CRYPTO_LIB_SHA512_ARCH + bool + depends on CRYPTO_LIB_SHA512 && !UML + config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index b0c0f8aea269..f6b6f370451e 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -64,6 +64,12 @@ libsha256-y := sha256.o obj-$(CONFIG_CRYPTO_LIB_SHA256_GENERIC) += libsha256-generic.o libsha256-generic-y := sha256-generic.o +obj-$(CONFIG_CRYPTO_LIB_SHA512) += libsha512.o +libsha512-y := sha512.o +ifeq ($(CONFIG_CRYPTO_LIB_SHA512_ARCH),y) +CFLAGS_sha512.o += -I$(src)/$(SRCARCH) +endif # CONFIG_CRYPTO_LIB_SHA512_ARCH + obj-$(CONFIG_MPILIB) += mpi/ obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c new file mode 100644 index 000000000000..536b71481b1c --- /dev/null +++ b/lib/crypto/sha512.c @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-384 and SHA-512 library functions + * + * Copyright (c) Jean-Luc Cooke + * Copyright (c) Andrew McDonald + * Copyright (c) 2003 Kyle McMartin + * Copyright 2025 Google LLC + */ + +#include +#include +#include +#include +#include +#include +#include + +static const struct sha512_block_state sha384_iv = { + .h = { + SHA384_H0, SHA384_H1, SHA384_H2, SHA384_H3, + SHA384_H4, SHA384_H5, SHA384_H6, SHA384_H7, + }, +}; + +static const struct sha512_block_state sha512_iv = { + .h = { + SHA512_H0, SHA512_H1, SHA512_H2, SHA512_H3, + SHA512_H4, SHA512_H5, SHA512_H6, SHA512_H7, + }, +}; + +static const u64 sha512_K[80] = { + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, + 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL, + 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, + 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL, + 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, + 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, + 
0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL, + 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, + 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL, + 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, + 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL, + 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, + 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL, + 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, + 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, +}; + +#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y)))) +#define e0(x) (ror64((x), 28) ^ ror64((x), 34) ^ ror64((x), 39)) +#define e1(x) (ror64((x), 14) ^ ror64((x), 18) ^ ror64((x), 41)) +#define s0(x) (ror64((x), 1) ^ ror64((x), 8) ^ ((x) >> 7)) +#define s1(x) (ror64((x), 19) ^ ror64((x), 61) ^ ((x) >> 6)) + +static void sha512_block_generic(struct sha512_block_state *state, + const u8 *data) +{ + u64 a = state->h[0]; + u64 b = state->h[1]; + u64 c = state->h[2]; + u64 d = state->h[3]; + u64 e = state->h[4]; + u64 f = state->h[5]; + u64 g = state->h[6]; + u64 h = state->h[7]; + u64 t1, t2; + u64 W[16]; + + for (int j = 0; j < 16; j++) + W[j] = get_unaligned_be64(data + j * sizeof(u64)); + + for (int i = 0; i < 80; i += 8) { + if ((i & 15) == 0 && i != 0) { + for (int j = 0; j < 16; j++) { + W[j & 15] += s1(W[(j - 2) & 15]) + + W[(j - 7) & 15] + + s0(W[(j - 15) & 15]); + } + } + t1 = h + e1(e) + Ch(e, f, g) + sha512_K[i] + W[(i & 15)]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + sha512_K[i+1] + W[(i & 15) + 1]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + sha512_K[i+2] + W[(i & 15) + 2]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + sha512_K[i+3] + W[(i & 15) + 3]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + sha512_K[i+4] + W[(i & 15) + 4]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + sha512_K[i+5] + W[(i & 15) + 5]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + sha512_K[i+6] + W[(i & 15) + 6]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + sha512_K[i+7] + W[(i & 15) + 7]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + } + + state->h[0] += a; + state->h[1] += b; + state->h[2] += c; + state->h[3] += d; + state->h[4] += e; + state->h[5] += f; + state->h[6] += g; + state->h[7] += h; +} + +static void __maybe_unused +sha512_blocks_generic(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + sha512_block_generic(state, data); + data += SHA512_BLOCK_SIZE; + } while (--nblocks); +} + +#ifdef CONFIG_CRYPTO_LIB_SHA512_ARCH +#include "sha512.h" /* $(SRCARCH)/sha512.h */ +#else +#define 
sha512_blocks sha512_blocks_generic +#endif + +static void __sha512_init(struct __sha512_ctx *ctx, + const struct sha512_block_state *iv, + u64 initial_bytecount) +{ + ctx->state = *iv; + ctx->bytecount_lo = initial_bytecount; + ctx->bytecount_hi = 0; +} + +void sha384_init(struct sha384_ctx *ctx) +{ + __sha512_init(&ctx->ctx, &sha384_iv, 0); +} +EXPORT_SYMBOL_GPL(sha384_init); + +void sha512_init(struct sha512_ctx *ctx) +{ + __sha512_init(&ctx->ctx, &sha512_iv, 0); +} +EXPORT_SYMBOL_GPL(sha512_init); + +void __sha512_update(struct __sha512_ctx *ctx, const u8 *data, size_t len) +{ + size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE; + + if (check_add_overflow(ctx->bytecount_lo, len, &ctx->bytecount_lo)) + ctx->bytecount_hi++; + + if (partial + len >= SHA512_BLOCK_SIZE) { + size_t nblocks; + + if (partial) { + size_t l = SHA512_BLOCK_SIZE - partial; + + memcpy(&ctx->buf[partial], data, l); + data += l; + len -= l; + + sha512_blocks(&ctx->state, ctx->buf, 1); + } + + nblocks = len / SHA512_BLOCK_SIZE; + len %= SHA512_BLOCK_SIZE; + + if (nblocks) { + sha512_blocks(&ctx->state, data, nblocks); + data += nblocks * SHA512_BLOCK_SIZE; + } + partial = 0; + } + if (len) + memcpy(&ctx->buf[partial], data, len); +} +EXPORT_SYMBOL_GPL(__sha512_update); + +static void __sha512_final(struct __sha512_ctx *ctx, + u8 *out, size_t digest_size) +{ + u64 bitcount_hi = (ctx->bytecount_hi << 3) | (ctx->bytecount_lo >> 61); + u64 bitcount_lo = ctx->bytecount_lo << 3; + size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE; + + ctx->buf[partial++] = 0x80; + if (partial > SHA512_BLOCK_SIZE - 16) { + memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - partial); + sha512_blocks(&ctx->state, ctx->buf, 1); + partial = 0; + } + memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - 16 - partial); + *(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 16] = cpu_to_be64(bitcount_hi); + *(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 8] = cpu_to_be64(bitcount_lo); + sha512_blocks(&ctx->state, ctx->buf, 1); + + for (size_t i = 0; i < digest_size; i += 8) + put_unaligned_be64(ctx->state.h[i / 8], out + i); +} + +void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]) +{ + __sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(sha384_final); + +void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]) +{ + __sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(sha512_final); + +void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE]) +{ + struct sha384_ctx ctx; + + sha384_init(&ctx); + sha384_update(&ctx, data, len); + sha384_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha384); + +void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]) +{ + struct sha512_ctx ctx; + + sha512_init(&ctx); + sha512_update(&ctx, data, len); + sha512_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha512); + +#ifdef sha512_mod_init_arch +static int __init sha512_mod_init(void) +{ + sha512_mod_init_arch(); + return 0; +} +subsys_initcall(sha512_mod_init); + +static void __exit sha512_mod_exit(void) +{ +} +module_exit(sha512_mod_exit); +#endif + +MODULE_DESCRIPTION("SHA-384 and SHA-512 library functions"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 23e8b4371dbd5907d633262f36903144a378a114 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:07 -0700 Subject: lib/crypto: sha512: Add HMAC-SHA384 and HMAC-SHA512 support Since HMAC support is commonly needed and is fairly simple, include it as a 
first-class citizen of the SHA-512 library. The API supports both incremental and one-shot computation, and either preparing the key ahead of time or just using a raw key. The implementation is much more streamlined than crypto/hmac.c. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha2.h | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib/crypto/Kconfig | 5 +- lib/crypto/sha512.c | 141 +++++++++++++++++++++++++++++++- 3 files changed, 364 insertions(+), 4 deletions(-) diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index f2a6e84b2840..296ce9d468bf 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -147,6 +147,22 @@ struct __sha512_ctx { }; void __sha512_update(struct __sha512_ctx *ctx, const u8 *data, size_t len); +/* + * HMAC key and message context structs, shared by HMAC-SHA384 and HMAC-SHA512. + * The hmac_sha384_* and hmac_sha512_* structs wrap this one so that the API has + * proper typing and doesn't allow mixing the functions arbitrarily. + */ +struct __hmac_sha512_key { + struct sha512_block_state istate; + struct sha512_block_state ostate; +}; +struct __hmac_sha512_ctx { + struct __sha512_ctx sha_ctx; + struct sha512_block_state ostate; +}; +void __hmac_sha512_init(struct __hmac_sha512_ctx *ctx, + const struct __hmac_sha512_key *key); + /** * struct sha384_ctx - Context for hashing a message with SHA-384 * @ctx: private @@ -202,6 +218,109 @@ void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]); */ void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE]); +/** + * struct hmac_sha384_key - Prepared key for HMAC-SHA384 + * @key: private + */ +struct hmac_sha384_key { + struct __hmac_sha512_key key; +}; + +/** + * struct hmac_sha384_ctx - Context for computing HMAC-SHA384 of a message + * @ctx: private + */ +struct hmac_sha384_ctx { + struct __hmac_sha512_ctx ctx; +}; + +/** + * hmac_sha384_preparekey() - Prepare a key for HMAC-SHA384 + * @key: (output) the key structure to initialize + * @raw_key: the raw HMAC-SHA384 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * Note: the caller is responsible for zeroizing both the struct hmac_sha384_key + * and the raw key once they are no longer needed. + * + * Context: Any context. + */ +void hmac_sha384_preparekey(struct hmac_sha384_key *key, + const u8 *raw_key, size_t raw_key_len); + +/** + * hmac_sha384_init() - Initialize an HMAC-SHA384 context for a new message + * @ctx: (output) the HMAC context to initialize + * @key: the prepared HMAC key + * + * If you don't need incremental computation, consider hmac_sha384() instead. + * + * Context: Any context. + */ +static inline void hmac_sha384_init(struct hmac_sha384_ctx *ctx, + const struct hmac_sha384_key *key) +{ + __hmac_sha512_init(&ctx->ctx, &key->key); +} + +/** + * hmac_sha384_update() - Update an HMAC-SHA384 context with message data + * @ctx: the HMAC context to update; must have been initialized + * @data: the message data + * @data_len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. 
+ */ +static inline void hmac_sha384_update(struct hmac_sha384_ctx *ctx, + const u8 *data, size_t data_len) +{ + __sha512_update(&ctx->ctx.sha_ctx, data, data_len); +} + +/** + * hmac_sha384_final() - Finish computing an HMAC-SHA384 value + * @ctx: the HMAC context to finalize; must have been initialized + * @out: (output) the resulting HMAC-SHA384 value + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void hmac_sha384_final(struct hmac_sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]); + +/** + * hmac_sha384() - Compute HMAC-SHA384 in one shot, using a prepared key + * @key: the prepared HMAC key + * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-SHA384 value + * + * If you're using the key only once, consider using hmac_sha384_usingrawkey(). + * + * Context: Any context. + */ +void hmac_sha384(const struct hmac_sha384_key *key, + const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE]); + +/** + * hmac_sha384_usingrawkey() - Compute HMAC-SHA384 in one shot, using a raw key + * @raw_key: the raw HMAC-SHA384 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-SHA384 value + * + * If you're using the key multiple times, prefer to use + * hmac_sha384_preparekey() followed by multiple calls to hmac_sha384() instead. + * + * Context: Any context. + */ +void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA384_DIGEST_SIZE]); + /** * struct sha512_ctx - Context for hashing a message with SHA-512 * @ctx: private @@ -257,4 +376,107 @@ void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]); */ void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]); +/** + * struct hmac_sha512_key - Prepared key for HMAC-SHA512 + * @key: private + */ +struct hmac_sha512_key { + struct __hmac_sha512_key key; +}; + +/** + * struct hmac_sha512_ctx - Context for computing HMAC-SHA512 of a message + * @ctx: private + */ +struct hmac_sha512_ctx { + struct __hmac_sha512_ctx ctx; +}; + +/** + * hmac_sha512_preparekey() - Prepare a key for HMAC-SHA512 + * @key: (output) the key structure to initialize + * @raw_key: the raw HMAC-SHA512 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * Note: the caller is responsible for zeroizing both the struct hmac_sha512_key + * and the raw key once they are no longer needed. + * + * Context: Any context. + */ +void hmac_sha512_preparekey(struct hmac_sha512_key *key, + const u8 *raw_key, size_t raw_key_len); + +/** + * hmac_sha512_init() - Initialize an HMAC-SHA512 context for a new message + * @ctx: (output) the HMAC context to initialize + * @key: the prepared HMAC key + * + * If you don't need incremental computation, consider hmac_sha512() instead. + * + * Context: Any context. + */ +static inline void hmac_sha512_init(struct hmac_sha512_ctx *ctx, + const struct hmac_sha512_key *key) +{ + __hmac_sha512_init(&ctx->ctx, &key->key); +} + +/** + * hmac_sha512_update() - Update an HMAC-SHA512 context with message data + * @ctx: the HMAC context to update; must have been initialized + * @data: the message data + * @data_len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. 
+ */ +static inline void hmac_sha512_update(struct hmac_sha512_ctx *ctx, + const u8 *data, size_t data_len) +{ + __sha512_update(&ctx->ctx.sha_ctx, data, data_len); +} + +/** + * hmac_sha512_final() - Finish computing an HMAC-SHA512 value + * @ctx: the HMAC context to finalize; must have been initialized + * @out: (output) the resulting HMAC-SHA512 value + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void hmac_sha512_final(struct hmac_sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]); + +/** + * hmac_sha512() - Compute HMAC-SHA512 in one shot, using a prepared key + * @key: the prepared HMAC key + * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-SHA512 value + * + * If you're using the key only once, consider using hmac_sha512_usingrawkey(). + * + * Context: Any context. + */ +void hmac_sha512(const struct hmac_sha512_key *key, + const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE]); + +/** + * hmac_sha512_usingrawkey() - Compute HMAC-SHA512 in one shot, using a raw key + * @raw_key: the raw HMAC-SHA512 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-SHA512 value + * + * If you're using the key multiple times, prefer to use + * hmac_sha512_preparekey() followed by multiple calls to hmac_sha512() instead. + * + * Context: Any context. + */ +void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA512_DIGEST_SIZE]); + #endif /* _CRYPTO_SHA2_H */ diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 2d295c0e0f79..d1bee3787eb3 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -170,8 +170,9 @@ config CRYPTO_LIB_SHA256_GENERIC config CRYPTO_LIB_SHA512 tristate help - The SHA-384 and SHA-512 library functions. Select this if your module - uses any of these functions from . + The SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions. + Select this if your module uses any of these functions from + . 
config CRYPTO_LIB_SHA512_ARCH bool diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c index 536b71481b1c..d514721491ca 100644 --- a/lib/crypto/sha512.c +++ b/lib/crypto/sha512.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * SHA-384 and SHA-512 library functions + * SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions * * Copyright (c) Jean-Luc Cooke * Copyright (c) Andrew McDonald @@ -8,6 +8,7 @@ * Copyright 2025 Google LLC */ +#include #include #include #include @@ -15,6 +16,7 @@ #include #include #include +#include static const struct sha512_block_state sha384_iv = { .h = { @@ -247,6 +249,141 @@ void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]) } EXPORT_SYMBOL_GPL(sha512); +static void __hmac_sha512_preparekey(struct __hmac_sha512_key *key, + const u8 *raw_key, size_t raw_key_len, + const struct sha512_block_state *iv) +{ + union { + u8 b[SHA512_BLOCK_SIZE]; + unsigned long w[SHA512_BLOCK_SIZE / sizeof(unsigned long)]; + } derived_key = { 0 }; + + if (unlikely(raw_key_len > SHA512_BLOCK_SIZE)) { + if (iv == &sha384_iv) + sha384(raw_key, raw_key_len, derived_key.b); + else + sha512(raw_key, raw_key_len, derived_key.b); + } else { + memcpy(derived_key.b, raw_key, raw_key_len); + } + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); + key->istate = *iv; + sha512_blocks(&key->istate, derived_key.b, 1); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ + HMAC_IPAD_VALUE); + key->ostate = *iv; + sha512_blocks(&key->ostate, derived_key.b, 1); + + memzero_explicit(&derived_key, sizeof(derived_key)); +} + +void hmac_sha384_preparekey(struct hmac_sha384_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha512_preparekey(&key->key, raw_key, raw_key_len, &sha384_iv); +} +EXPORT_SYMBOL_GPL(hmac_sha384_preparekey); + +void hmac_sha512_preparekey(struct hmac_sha512_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha512_preparekey(&key->key, raw_key, raw_key_len, &sha512_iv); +} +EXPORT_SYMBOL_GPL(hmac_sha512_preparekey); + +void __hmac_sha512_init(struct __hmac_sha512_ctx *ctx, + const struct __hmac_sha512_key *key) +{ + __sha512_init(&ctx->sha_ctx, &key->istate, SHA512_BLOCK_SIZE); + ctx->ostate = key->ostate; +} +EXPORT_SYMBOL_GPL(__hmac_sha512_init); + +static void __hmac_sha512_final(struct __hmac_sha512_ctx *ctx, + u8 *out, size_t digest_size) +{ + /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */ + __sha512_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size); + memset(&ctx->sha_ctx.buf[digest_size], 0, + SHA512_BLOCK_SIZE - digest_size); + ctx->sha_ctx.buf[digest_size] = 0x80; + *(__be32 *)&ctx->sha_ctx.buf[SHA512_BLOCK_SIZE - 4] = + cpu_to_be32(8 * (SHA512_BLOCK_SIZE + digest_size)); + + /* Compute the outer hash, which gives the HMAC value. 
*/ + sha512_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1); + for (size_t i = 0; i < digest_size; i += 8) + put_unaligned_be64(ctx->ostate.h[i / 8], out + i); + + memzero_explicit(ctx, sizeof(*ctx)); +} + +void hmac_sha384_final(struct hmac_sha384_ctx *ctx, + u8 out[SHA384_DIGEST_SIZE]) +{ + __hmac_sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE); +} +EXPORT_SYMBOL_GPL(hmac_sha384_final); + +void hmac_sha512_final(struct hmac_sha512_ctx *ctx, + u8 out[SHA512_DIGEST_SIZE]) +{ + __hmac_sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE); +} +EXPORT_SYMBOL_GPL(hmac_sha512_final); + +void hmac_sha384(const struct hmac_sha384_key *key, + const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE]) +{ + struct hmac_sha384_ctx ctx; + + hmac_sha384_init(&ctx, key); + hmac_sha384_update(&ctx, data, data_len); + hmac_sha384_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha384); + +void hmac_sha512(const struct hmac_sha512_key *key, + const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE]) +{ + struct hmac_sha512_ctx ctx; + + hmac_sha512_init(&ctx, key); + hmac_sha512_update(&ctx, data, data_len); + hmac_sha512_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha512); + +void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA384_DIGEST_SIZE]) +{ + struct hmac_sha384_key key; + + hmac_sha384_preparekey(&key, raw_key, raw_key_len); + hmac_sha384(&key, data, data_len, out); + + memzero_explicit(&key, sizeof(key)); +} +EXPORT_SYMBOL_GPL(hmac_sha384_usingrawkey); + +void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA512_DIGEST_SIZE]) +{ + struct hmac_sha512_key key; + + hmac_sha512_preparekey(&key, raw_key, raw_key_len); + hmac_sha512(&key, data, data_len, out); + + memzero_explicit(&key, sizeof(key)); +} +EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey); + #ifdef sha512_mod_init_arch static int __init sha512_mod_init(void) { @@ -261,5 +398,5 @@ static void __exit sha512_mod_exit(void) module_exit(sha512_mod_exit); #endif -MODULE_DESCRIPTION("SHA-384 and SHA-512 library functions"); +MODULE_DESCRIPTION("SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions"); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 485deceec03997eabd884d4da89217af73d1ab8d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:08 -0700 Subject: crypto: riscv/sha512 - Stop depending on sha512_generic_block_fn sha512_generic_block_fn() will no longer be available when the SHA-512 support in the old-school crypto API is changed to just wrap the SHA-512 library. Replace the use of sha512_generic_block_fn() in sha512-riscv64-glue.c with temporary code that uses the library's __sha512_update(). This is just a temporary workaround to keep the kernel building and functional at each commit; this code gets superseded when the RISC-V optimized SHA-512 is migrated to lib/crypto/ anyway. 
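For clarity, the same fallback is shown here in isolation (the sha512_blocks_via_lib name is illustrative, not part of the patch, and the sketch assumes the includes already present in sha512-riscv64-glue.c). Because the temporary __sha512_ctx starts with a byte count of zero and the input length is a whole number of SHA512_BLOCK_SIZE blocks, __sha512_update() hands the data straight to the block function with no partial-block buffering, so copying the state in and back out is all that is needed:

	static void sha512_blocks_via_lib(struct sha512_state *state,
					  const u8 *data, int num_blocks)
	{
		struct __sha512_ctx ctx = {};

		static_assert(sizeof(ctx.state) == sizeof(state->state));
		memcpy(&ctx.state, state->state, sizeof(ctx.state));
		__sha512_update(&ctx, data,
				(size_t)num_blocks * SHA512_BLOCK_SIZE);
		memcpy(state->state, &ctx.state, sizeof(state->state));
	}
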
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/riscv/crypto/Kconfig | 1 + arch/riscv/crypto/sha512-riscv64-glue.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index cd9b776602f8..53e4e1eacf55 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -31,6 +31,7 @@ config CRYPTO_GHASH_RISCV64 config CRYPTO_SHA512_RISCV64 tristate "Hash functions: SHA-384 and SHA-512" depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + select CRYPTO_LIB_SHA512 select CRYPTO_SHA512 help SHA-384 and SHA-512 secure hash algorithm (FIPS 180) diff --git a/arch/riscv/crypto/sha512-riscv64-glue.c b/arch/riscv/crypto/sha512-riscv64-glue.c index 4634fca78ae2..b3dbc71de07b 100644 --- a/arch/riscv/crypto/sha512-riscv64-glue.c +++ b/arch/riscv/crypto/sha512-riscv64-glue.c @@ -38,7 +38,13 @@ static void sha512_block(struct sha512_state *state, const u8 *data, sha512_transform_zvknhb_zvkb(state, data, num_blocks); kernel_vector_end(); } else { - sha512_generic_block_fn(state, data, num_blocks); + struct __sha512_ctx ctx = {}; + + static_assert(sizeof(ctx.state) == sizeof(state->state)); + memcpy(&ctx.state, state->state, sizeof(ctx.state)); + __sha512_update(&ctx, data, + (size_t)num_blocks * SHA512_BLOCK_SIZE); + memcpy(state->state, &ctx.state, sizeof(state->state)); } } -- cgit v1.2.3 From 469acaa12502e05eefd439693361fe4b851a4fd5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:09 -0700 Subject: crypto: sha512 - Replace sha512_generic with wrapper around SHA-512 library Delete crypto/sha512_generic.c, which provided "generic" SHA-384 and SHA-512 crypto_shash algorithms. Replace it with crypto/sha512.c which provides SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 crypto_shash algorithms using the corresponding library functions. This is a prerequisite for migrating all the arch-optimized SHA-512 code (which is almost 3000 lines) to lib/crypto/ rather than duplicating it. Since the replacement crypto_shash algorithms are implemented using the (potentially arch-optimized) library functions, give them cra_driver_names ending with "-lib" rather than "-generic". Update crypto/testmgr.c and one odd driver to take this change in driver name into account. Besides these cases which are accounted for, there are no known cases where the cra_driver_name was being depended on. This change does mean that the abstract partial block handling code in crypto/shash.c, which got added in 6.16, no longer gets used. But that's fine; the library has to implement the partial block handling anyway, and it's better to do it in the library since the block size and other properties of the algorithm are all fixed at compile time there, resulting in more streamlined code. 
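From the perspective of kernel code that requests these algorithms by cra_name, nothing changes; only code that matches on the driver name is affected. A minimal sketch of that user-visible difference, using only standard <crypto/hash.h> calls (the example_shash_lookup function and its trimmed error handling are illustrative only):

	#include <crypto/hash.h>
	#include <linux/err.h>
	#include <linux/printk.h>

	static void example_shash_lookup(void)
	{
		struct crypto_shash *tfm;

		/* Resolves exactly as before; only the driver name differs. */
		tfm = crypto_alloc_shash("sha512", 0, 0);
		if (IS_ERR(tfm))
			return;

		/* Now reports "sha512-lib" rather than "sha512-generic". */
		pr_info("sha512 driver: %s\n",
			crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)));

		crypto_free_shash(tfm);
	}
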
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Kconfig | 4 +- crypto/Makefile | 2 +- crypto/sha512.c | 258 ++++++++++++++++++++++++++++++++++ crypto/sha512_generic.c | 217 ---------------------------- crypto/testmgr.c | 10 ++ drivers/crypto/starfive/jh7110-hash.c | 8 +- include/crypto/sha512_base.h | 3 - 7 files changed, 276 insertions(+), 226 deletions(-) create mode 100644 crypto/sha512.c delete mode 100644 crypto/sha512_generic.c diff --git a/crypto/Kconfig b/crypto/Kconfig index e1cfd0d4cc8f..cb40a9b46972 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1002,8 +1002,10 @@ config CRYPTO_SHA256 config CRYPTO_SHA512 tristate "SHA-384 and SHA-512" select CRYPTO_HASH + select CRYPTO_LIB_SHA512 help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180, ISO/IEC 10118-3) + SHA-384 and SHA-512 secure hash algorithms (FIPS 180, ISO/IEC + 10118-3), including HMAC support. config CRYPTO_SHA3 tristate "SHA-3" diff --git a/crypto/Makefile b/crypto/Makefile index 017df3a2e4bb..271c77462cec 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -78,7 +78,7 @@ obj-$(CONFIG_CRYPTO_RMD160) += rmd160.o obj-$(CONFIG_CRYPTO_SHA1) += sha1_generic.o obj-$(CONFIG_CRYPTO_SHA256) += sha256.o CFLAGS_sha256.o += -DARCH=$(ARCH) -obj-$(CONFIG_CRYPTO_SHA512) += sha512_generic.o +obj-$(CONFIG_CRYPTO_SHA512) += sha512.o obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o obj-$(CONFIG_CRYPTO_SM3_GENERIC) += sm3_generic.o obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o diff --git a/crypto/sha512.c b/crypto/sha512.c new file mode 100644 index 000000000000..d1e5400fe590 --- /dev/null +++ b/crypto/sha512.c @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Crypto API support for SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 + * + * Copyright (c) Jean-Luc Cooke + * Copyright (c) Andrew McDonald + * Copyright (c) 2003 Kyle McMartin + * Copyright 2025 Google LLC + */ +#include +#include +#include +#include + +/* SHA-384 */ + +const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE] = { + 0x38, 0xb0, 0x60, 0xa7, 0x51, 0xac, 0x96, 0x38, + 0x4c, 0xd9, 0x32, 0x7e, 0xb1, 0xb1, 0xe3, 0x6a, + 0x21, 0xfd, 0xb7, 0x11, 0x14, 0xbe, 0x07, 0x43, + 0x4c, 0x0c, 0xc7, 0xbf, 0x63, 0xf6, 0xe1, 0xda, + 0x27, 0x4e, 0xde, 0xbf, 0xe7, 0x6f, 0x65, 0xfb, + 0xd5, 0x1a, 0xd2, 0xf1, 0x48, 0x98, 0xb9, 0x5b +}; +EXPORT_SYMBOL_GPL(sha384_zero_message_hash); + +#define SHA384_CTX(desc) ((struct sha384_ctx *)shash_desc_ctx(desc)) + +static int crypto_sha384_init(struct shash_desc *desc) +{ + sha384_init(SHA384_CTX(desc)); + return 0; +} + +static int crypto_sha384_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + sha384_update(SHA384_CTX(desc), data, len); + return 0; +} + +static int crypto_sha384_final(struct shash_desc *desc, u8 *out) +{ + sha384_final(SHA384_CTX(desc), out); + return 0; +} + +static int crypto_sha384_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + sha384(data, len, out); + return 0; +} + +/* SHA-512 */ + +const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE] = { + 0xcf, 0x83, 0xe1, 0x35, 0x7e, 0xef, 0xb8, 0xbd, + 0xf1, 0x54, 0x28, 0x50, 0xd6, 0x6d, 0x80, 0x07, + 0xd6, 0x20, 0xe4, 0x05, 0x0b, 0x57, 0x15, 0xdc, + 0x83, 0xf4, 0xa9, 0x21, 0xd3, 0x6c, 0xe9, 0xce, + 0x47, 0xd0, 0xd1, 0x3c, 0x5d, 0x85, 0xf2, 0xb0, + 0xff, 0x83, 0x18, 0xd2, 0x87, 0x7e, 0xec, 0x2f, + 0x63, 0xb9, 0x31, 0xbd, 0x47, 0x41, 0x7a, 0x81, + 0xa5, 0x38, 0x32, 0x7a, 0xf9, 0x27, 0xda, 0x3e +}; 
+EXPORT_SYMBOL_GPL(sha512_zero_message_hash); + +#define SHA512_CTX(desc) ((struct sha512_ctx *)shash_desc_ctx(desc)) + +static int crypto_sha512_init(struct shash_desc *desc) +{ + sha512_init(SHA512_CTX(desc)); + return 0; +} + +static int crypto_sha512_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + sha512_update(SHA512_CTX(desc), data, len); + return 0; +} + +static int crypto_sha512_final(struct shash_desc *desc, u8 *out) +{ + sha512_final(SHA512_CTX(desc), out); + return 0; +} + +static int crypto_sha512_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + sha512(data, len, out); + return 0; +} + +/* HMAC-SHA384 */ + +#define HMAC_SHA384_KEY(tfm) ((struct hmac_sha384_key *)crypto_shash_ctx(tfm)) +#define HMAC_SHA384_CTX(desc) ((struct hmac_sha384_ctx *)shash_desc_ctx(desc)) + +static int crypto_hmac_sha384_setkey(struct crypto_shash *tfm, + const u8 *raw_key, unsigned int keylen) +{ + hmac_sha384_preparekey(HMAC_SHA384_KEY(tfm), raw_key, keylen); + return 0; +} + +static int crypto_hmac_sha384_init(struct shash_desc *desc) +{ + hmac_sha384_init(HMAC_SHA384_CTX(desc), HMAC_SHA384_KEY(desc->tfm)); + return 0; +} + +static int crypto_hmac_sha384_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + hmac_sha384_update(HMAC_SHA384_CTX(desc), data, len); + return 0; +} + +static int crypto_hmac_sha384_final(struct shash_desc *desc, u8 *out) +{ + hmac_sha384_final(HMAC_SHA384_CTX(desc), out); + return 0; +} + +static int crypto_hmac_sha384_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, + u8 *out) +{ + hmac_sha384(HMAC_SHA384_KEY(desc->tfm), data, len, out); + return 0; +} + +/* HMAC-SHA512 */ + +#define HMAC_SHA512_KEY(tfm) ((struct hmac_sha512_key *)crypto_shash_ctx(tfm)) +#define HMAC_SHA512_CTX(desc) ((struct hmac_sha512_ctx *)shash_desc_ctx(desc)) + +static int crypto_hmac_sha512_setkey(struct crypto_shash *tfm, + const u8 *raw_key, unsigned int keylen) +{ + hmac_sha512_preparekey(HMAC_SHA512_KEY(tfm), raw_key, keylen); + return 0; +} + +static int crypto_hmac_sha512_init(struct shash_desc *desc) +{ + hmac_sha512_init(HMAC_SHA512_CTX(desc), HMAC_SHA512_KEY(desc->tfm)); + return 0; +} + +static int crypto_hmac_sha512_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + hmac_sha512_update(HMAC_SHA512_CTX(desc), data, len); + return 0; +} + +static int crypto_hmac_sha512_final(struct shash_desc *desc, u8 *out) +{ + hmac_sha512_final(HMAC_SHA512_CTX(desc), out); + return 0; +} + +static int crypto_hmac_sha512_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, + u8 *out) +{ + hmac_sha512(HMAC_SHA512_KEY(desc->tfm), data, len, out); + return 0; +} + +/* Algorithm definitions */ + +static struct shash_alg algs[] = { + { + .base.cra_name = "sha384", + .base.cra_driver_name = "sha384-lib", + .base.cra_priority = 300, + .base.cra_blocksize = SHA384_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .digestsize = SHA384_DIGEST_SIZE, + .init = crypto_sha384_init, + .update = crypto_sha384_update, + .final = crypto_sha384_final, + .digest = crypto_sha384_digest, + .descsize = sizeof(struct sha384_ctx), + }, + { + .base.cra_name = "sha512", + .base.cra_driver_name = "sha512-lib", + .base.cra_priority = 300, + .base.cra_blocksize = SHA512_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .digestsize = SHA512_DIGEST_SIZE, + .init = crypto_sha512_init, + .update = crypto_sha512_update, + .final = crypto_sha512_final, + .digest = crypto_sha512_digest, + .descsize = sizeof(struct 
sha512_ctx), + }, + { + .base.cra_name = "hmac(sha384)", + .base.cra_driver_name = "hmac-sha384-lib", + .base.cra_priority = 300, + .base.cra_blocksize = SHA384_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct hmac_sha384_key), + .base.cra_module = THIS_MODULE, + .digestsize = SHA384_DIGEST_SIZE, + .setkey = crypto_hmac_sha384_setkey, + .init = crypto_hmac_sha384_init, + .update = crypto_hmac_sha384_update, + .final = crypto_hmac_sha384_final, + .digest = crypto_hmac_sha384_digest, + .descsize = sizeof(struct hmac_sha384_ctx), + }, + { + .base.cra_name = "hmac(sha512)", + .base.cra_driver_name = "hmac-sha512-lib", + .base.cra_priority = 300, + .base.cra_blocksize = SHA512_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct hmac_sha512_key), + .base.cra_module = THIS_MODULE, + .digestsize = SHA512_DIGEST_SIZE, + .setkey = crypto_hmac_sha512_setkey, + .init = crypto_hmac_sha512_init, + .update = crypto_hmac_sha512_update, + .final = crypto_hmac_sha512_final, + .digest = crypto_hmac_sha512_digest, + .descsize = sizeof(struct hmac_sha512_ctx), + }, +}; + +static int __init crypto_sha512_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} +module_init(crypto_sha512_mod_init); + +static void __exit crypto_sha512_mod_exit(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} +module_exit(crypto_sha512_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Crypto API support for SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512"); + +MODULE_ALIAS_CRYPTO("sha384"); +MODULE_ALIAS_CRYPTO("sha384-lib"); +MODULE_ALIAS_CRYPTO("sha512"); +MODULE_ALIAS_CRYPTO("sha512-lib"); +MODULE_ALIAS_CRYPTO("hmac(sha384)"); +MODULE_ALIAS_CRYPTO("hmac-sha384-lib"); +MODULE_ALIAS_CRYPTO("hmac(sha512)"); +MODULE_ALIAS_CRYPTO("hmac-sha512-lib"); diff --git a/crypto/sha512_generic.c b/crypto/sha512_generic.c deleted file mode 100644 index 7368173f545e..000000000000 --- a/crypto/sha512_generic.c +++ /dev/null @@ -1,217 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* SHA-512 code by Jean-Luc Cooke - * - * Copyright (c) Jean-Luc Cooke - * Copyright (c) Andrew McDonald - * Copyright (c) 2003 Kyle McMartin - */ -#include -#include -#include -#include -#include -#include - -const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE] = { - 0x38, 0xb0, 0x60, 0xa7, 0x51, 0xac, 0x96, 0x38, - 0x4c, 0xd9, 0x32, 0x7e, 0xb1, 0xb1, 0xe3, 0x6a, - 0x21, 0xfd, 0xb7, 0x11, 0x14, 0xbe, 0x07, 0x43, - 0x4c, 0x0c, 0xc7, 0xbf, 0x63, 0xf6, 0xe1, 0xda, - 0x27, 0x4e, 0xde, 0xbf, 0xe7, 0x6f, 0x65, 0xfb, - 0xd5, 0x1a, 0xd2, 0xf1, 0x48, 0x98, 0xb9, 0x5b -}; -EXPORT_SYMBOL_GPL(sha384_zero_message_hash); - -const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE] = { - 0xcf, 0x83, 0xe1, 0x35, 0x7e, 0xef, 0xb8, 0xbd, - 0xf1, 0x54, 0x28, 0x50, 0xd6, 0x6d, 0x80, 0x07, - 0xd6, 0x20, 0xe4, 0x05, 0x0b, 0x57, 0x15, 0xdc, - 0x83, 0xf4, 0xa9, 0x21, 0xd3, 0x6c, 0xe9, 0xce, - 0x47, 0xd0, 0xd1, 0x3c, 0x5d, 0x85, 0xf2, 0xb0, - 0xff, 0x83, 0x18, 0xd2, 0x87, 0x7e, 0xec, 0x2f, - 0x63, 0xb9, 0x31, 0xbd, 0x47, 0x41, 0x7a, 0x81, - 0xa5, 0x38, 0x32, 0x7a, 0xf9, 0x27, 0xda, 0x3e -}; -EXPORT_SYMBOL_GPL(sha512_zero_message_hash); - -static inline u64 Ch(u64 x, u64 y, u64 z) -{ - return z ^ (x & (y ^ z)); -} - -static inline u64 Maj(u64 x, u64 y, u64 z) -{ - return (x & y) | (z & (x | y)); -} - -static const u64 sha512_K[80] = { - 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, - 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, - 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL, - 
0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, - 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, - 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, - 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL, - 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, - 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, - 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, - 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL, - 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, - 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, - 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, - 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL, - 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, - 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, - 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, - 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL, - 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, - 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, - 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, - 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL, - 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, - 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, - 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, - 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, -}; - -#define e0(x) (ror64(x,28) ^ ror64(x,34) ^ ror64(x,39)) -#define e1(x) (ror64(x,14) ^ ror64(x,18) ^ ror64(x,41)) -#define s0(x) (ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7)) -#define s1(x) (ror64(x,19) ^ ror64(x,61) ^ (x >> 6)) - -static inline void LOAD_OP(int I, u64 *W, const u8 *input) -{ - W[I] = get_unaligned_be64((__u64 *)input + I); -} - -static inline void BLEND_OP(int I, u64 *W) -{ - W[I & 15] += s1(W[(I-2) & 15]) + W[(I-7) & 15] + s0(W[(I-15) & 15]); -} - -static void -sha512_transform(u64 *state, const u8 *input) -{ - u64 a, b, c, d, e, f, g, h, t1, t2; - - int i; - u64 W[16]; - - /* load the state into our registers */ - a=state[0]; b=state[1]; c=state[2]; d=state[3]; - e=state[4]; f=state[5]; g=state[6]; h=state[7]; - - /* now iterate */ - for (i=0; i<80; i+=8) { - if (!(i & 8)) { - int j; - - if (i < 16) { - /* load the input */ - for (j = 0; j < 16; j++) - LOAD_OP(i + j, W, input); - } else { - for (j = 0; j < 16; j++) { - BLEND_OP(i + j, W); - } - } - } - - t1 = h + e1(e) + Ch(e,f,g) + sha512_K[i ] + W[(i & 15)]; - t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; - t1 = g + e1(d) + Ch(d,e,f) + sha512_K[i+1] + W[(i & 15) + 1]; - t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; - t1 = f + e1(c) + Ch(c,d,e) + sha512_K[i+2] + W[(i & 15) + 2]; - t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; - t1 = e + e1(b) + Ch(b,c,d) + sha512_K[i+3] + W[(i & 15) + 3]; - t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; - t1 = d + e1(a) + Ch(a,b,c) + sha512_K[i+4] + W[(i & 15) + 4]; - t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; - t1 = c + e1(h) + Ch(h,a,b) + sha512_K[i+5] + W[(i & 15) + 5]; - t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; - t1 = b + e1(g) + Ch(g,h,a) + sha512_K[i+6] + W[(i & 15) + 6]; - t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; - t1 = a + e1(f) + Ch(f,g,h) + sha512_K[i+7] + W[(i & 15) + 7]; - t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; - } - - state[0] += a; state[1] += b; 
state[2] += c; state[3] += d; - state[4] += e; state[5] += f; state[6] += g; state[7] += h; -} - -void sha512_generic_block_fn(struct sha512_state *sst, u8 const *src, - int blocks) -{ - do { - sha512_transform(sst->state, src); - src += SHA512_BLOCK_SIZE; - } while (--blocks); -} -EXPORT_SYMBOL_GPL(sha512_generic_block_fn); - -static int crypto_sha512_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_base_do_update_blocks(desc, data, len, - sha512_generic_block_fn); -} - -static int crypto_sha512_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *hash) -{ - sha512_base_do_finup(desc, data, len, sha512_generic_block_fn); - return sha512_base_finish(desc, hash); -} - -static struct shash_alg sha512_algs[2] = { { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_base_init, - .update = crypto_sha512_update, - .finup = crypto_sha512_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name = "sha512-generic", - .cra_priority = 100, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_base_init, - .update = crypto_sha512_update, - .finup = crypto_sha512_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name = "sha384-generic", - .cra_priority = 100, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int __init sha512_generic_mod_init(void) -{ - return crypto_register_shashes(sha512_algs, ARRAY_SIZE(sha512_algs)); -} - -static void __exit sha512_generic_mod_fini(void) -{ - crypto_unregister_shashes(sha512_algs, ARRAY_SIZE(sha512_algs)); -} - -module_init(sha512_generic_mod_init); -module_exit(sha512_generic_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-512 and SHA-384 Secure Hash Algorithms"); - -MODULE_ALIAS_CRYPTO("sha384"); -MODULE_ALIAS_CRYPTO("sha384-generic"); -MODULE_ALIAS_CRYPTO("sha512"); -MODULE_ALIAS_CRYPTO("sha512-generic"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 32f753d6c430..9d8b11ea4af7 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4315,12 +4315,14 @@ static const struct alg_test_desc alg_test_descs[] = { .fips_allowed = 1, }, { .alg = "authenc(hmac(sha384),cbc(des))", + .generic_driver = "authenc(hmac-sha384-lib,cbc(des-generic))", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_sha384_des_cbc_tv_temp) } }, { .alg = "authenc(hmac(sha384),cbc(des3_ede))", + .generic_driver = "authenc(hmac-sha384-lib,cbc(des3_ede-generic))", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_sha384_des3_ede_cbc_tv_temp) @@ -4331,6 +4333,7 @@ static const struct alg_test_desc alg_test_descs[] = { .fips_allowed = 1, }, { .alg = "authenc(hmac(sha384),cts(cbc(aes)))", + .generic_driver = "authenc(hmac-sha384-lib,cts(cbc(aes-generic)))", .test = alg_test_aead, .suite = { .aead = __VECS(krb5_test_aes256_cts_hmac_sha384_192) @@ -4341,6 +4344,7 @@ static const struct alg_test_desc alg_test_descs[] = { .fips_allowed = 1, }, { .alg = "authenc(hmac(sha512),cbc(aes))", + .generic_driver = "authenc(hmac-sha512-lib,cbc(aes-generic))", .fips_allowed = 1, .test = alg_test_aead, .suite = { @@ -4348,12 +4352,14 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "authenc(hmac(sha512),cbc(des))", + .generic_driver = 
"authenc(hmac-sha512-lib,cbc(des-generic))", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_sha512_des_cbc_tv_temp) } }, { .alg = "authenc(hmac(sha512),cbc(des3_ede))", + .generic_driver = "authenc(hmac-sha512-lib,cbc(des3_ede-generic))", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_sha512_des3_ede_cbc_tv_temp) @@ -5157,6 +5163,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha384)", + .generic_driver = "hmac-sha384-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5164,6 +5171,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha512)", + .generic_driver = "hmac-sha512-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5493,6 +5501,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha384", + .generic_driver = "sha384-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5500,6 +5509,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha512", + .generic_driver = "sha512-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { diff --git a/drivers/crypto/starfive/jh7110-hash.c b/drivers/crypto/starfive/jh7110-hash.c index 2c60a1047bc3..4abbff07412f 100644 --- a/drivers/crypto/starfive/jh7110-hash.c +++ b/drivers/crypto/starfive/jh7110-hash.c @@ -505,13 +505,13 @@ static int starfive_sha256_init_tfm(struct crypto_ahash *hash) static int starfive_sha384_init_tfm(struct crypto_ahash *hash) { - return starfive_hash_init_tfm(hash, "sha384-generic", + return starfive_hash_init_tfm(hash, "sha384-lib", STARFIVE_HASH_SHA384, 0); } static int starfive_sha512_init_tfm(struct crypto_ahash *hash) { - return starfive_hash_init_tfm(hash, "sha512-generic", + return starfive_hash_init_tfm(hash, "sha512-lib", STARFIVE_HASH_SHA512, 0); } @@ -535,13 +535,13 @@ static int starfive_hmac_sha256_init_tfm(struct crypto_ahash *hash) static int starfive_hmac_sha384_init_tfm(struct crypto_ahash *hash) { - return starfive_hash_init_tfm(hash, "hmac(sha384-generic)", + return starfive_hash_init_tfm(hash, "hmac-sha384-lib", STARFIVE_HASH_SHA384, 1); } static int starfive_hmac_sha512_init_tfm(struct crypto_ahash *hash) { - return starfive_hash_init_tfm(hash, "hmac(sha512-generic)", + return starfive_hash_init_tfm(hash, "hmac-sha512-lib", STARFIVE_HASH_SHA512, 1); } diff --git a/include/crypto/sha512_base.h b/include/crypto/sha512_base.h index aa814bab442d..d1361b3eb70b 100644 --- a/include/crypto/sha512_base.h +++ b/include/crypto/sha512_base.h @@ -114,7 +114,4 @@ static inline int sha512_base_finish(struct shash_desc *desc, u8 *out) return 0; } -void sha512_generic_block_fn(struct sha512_state *sst, u8 const *src, - int blocks); - #endif /* _CRYPTO_SHA512_BASE_H */ -- cgit v1.2.3 From 4bc7f7b687a2a1506cdc457bc4f6d29a81794a08 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:10 -0700 Subject: crypto: sha512 - Use same state format as legacy drivers Make the export and import functions for the sha384, sha512, hmac(sha384), and hmac(sha512) shash algorithms use the same format as the padlock-sha and nx-sha512 drivers, as required by Herbert. 
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/sha512.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/crypto/sha512.c b/crypto/sha512.c index d1e5400fe590..fb1c520978ef 100644 --- a/crypto/sha512.c +++ b/crypto/sha512.c @@ -12,6 +12,44 @@ #include #include +/* + * Export and import functions. crypto_shash wants a particular format that + * matches that used by some legacy drivers. It currently is the same as the + * library SHA context, except the value in bytecount_lo must be block-aligned + * and the remainder must be stored in an extra u8 appended to the struct. + */ + +#define SHA512_SHASH_STATE_SIZE 209 +static_assert(offsetof(struct __sha512_ctx, state) == 0); +static_assert(offsetof(struct __sha512_ctx, bytecount_lo) == 64); +static_assert(offsetof(struct __sha512_ctx, bytecount_hi) == 72); +static_assert(offsetof(struct __sha512_ctx, buf) == 80); +static_assert(sizeof(struct __sha512_ctx) + 1 == SHA512_SHASH_STATE_SIZE); + +static int __crypto_sha512_export(const struct __sha512_ctx *ctx0, void *out) +{ + struct __sha512_ctx ctx = *ctx0; + unsigned int partial; + u8 *p = out; + + partial = ctx.bytecount_lo % SHA512_BLOCK_SIZE; + ctx.bytecount_lo -= partial; + memcpy(p, &ctx, sizeof(ctx)); + p += sizeof(ctx); + *p = partial; + return 0; +} + +static int __crypto_sha512_import(struct __sha512_ctx *ctx, const void *in) +{ + const u8 *p = in; + + memcpy(ctx, p, sizeof(*ctx)); + p += sizeof(*ctx); + ctx->bytecount_lo += *p; + return 0; +} + /* SHA-384 */ const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE] = { @@ -52,6 +90,16 @@ static int crypto_sha384_digest(struct shash_desc *desc, return 0; } +static int crypto_sha384_export(struct shash_desc *desc, void *out) +{ + return __crypto_sha512_export(&SHA384_CTX(desc)->ctx, out); +} + +static int crypto_sha384_import(struct shash_desc *desc, const void *in) +{ + return __crypto_sha512_import(&SHA384_CTX(desc)->ctx, in); +} + /* SHA-512 */ const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE] = { @@ -94,6 +142,16 @@ static int crypto_sha512_digest(struct shash_desc *desc, return 0; } +static int crypto_sha512_export(struct shash_desc *desc, void *out) +{ + return __crypto_sha512_export(&SHA512_CTX(desc)->ctx, out); +} + +static int crypto_sha512_import(struct shash_desc *desc, const void *in) +{ + return __crypto_sha512_import(&SHA512_CTX(desc)->ctx, in); +} + /* HMAC-SHA384 */ #define HMAC_SHA384_KEY(tfm) ((struct hmac_sha384_key *)crypto_shash_ctx(tfm)) @@ -133,6 +191,19 @@ static int crypto_hmac_sha384_digest(struct shash_desc *desc, return 0; } +static int crypto_hmac_sha384_export(struct shash_desc *desc, void *out) +{ + return __crypto_sha512_export(&HMAC_SHA384_CTX(desc)->ctx.sha_ctx, out); +} + +static int crypto_hmac_sha384_import(struct shash_desc *desc, const void *in) +{ + struct hmac_sha384_ctx *ctx = HMAC_SHA384_CTX(desc); + + ctx->ctx.ostate = HMAC_SHA384_KEY(desc->tfm)->key.ostate; + return __crypto_sha512_import(&ctx->ctx.sha_ctx, in); +} + /* HMAC-SHA512 */ #define HMAC_SHA512_KEY(tfm) ((struct hmac_sha512_key *)crypto_shash_ctx(tfm)) @@ -172,6 +243,19 @@ static int crypto_hmac_sha512_digest(struct shash_desc *desc, return 0; } +static int crypto_hmac_sha512_export(struct shash_desc *desc, void *out) +{ + return __crypto_sha512_export(&HMAC_SHA512_CTX(desc)->ctx.sha_ctx, out); +} + +static int crypto_hmac_sha512_import(struct shash_desc *desc, const void *in) +{ + 
struct hmac_sha512_ctx *ctx = HMAC_SHA512_CTX(desc); + + ctx->ctx.ostate = HMAC_SHA512_KEY(desc->tfm)->key.ostate; + return __crypto_sha512_import(&ctx->ctx.sha_ctx, in); +} + /* Algorithm definitions */ static struct shash_alg algs[] = { @@ -186,7 +270,10 @@ static struct shash_alg algs[] = { .update = crypto_sha384_update, .final = crypto_sha384_final, .digest = crypto_sha384_digest, + .export = crypto_sha384_export, + .import = crypto_sha384_import, .descsize = sizeof(struct sha384_ctx), + .statesize = SHA512_SHASH_STATE_SIZE, }, { .base.cra_name = "sha512", @@ -199,7 +286,10 @@ static struct shash_alg algs[] = { .update = crypto_sha512_update, .final = crypto_sha512_final, .digest = crypto_sha512_digest, + .export = crypto_sha512_export, + .import = crypto_sha512_import, .descsize = sizeof(struct sha512_ctx), + .statesize = SHA512_SHASH_STATE_SIZE, }, { .base.cra_name = "hmac(sha384)", @@ -214,7 +304,10 @@ static struct shash_alg algs[] = { .update = crypto_hmac_sha384_update, .final = crypto_hmac_sha384_final, .digest = crypto_hmac_sha384_digest, + .export = crypto_hmac_sha384_export, + .import = crypto_hmac_sha384_import, .descsize = sizeof(struct hmac_sha384_ctx), + .statesize = SHA512_SHASH_STATE_SIZE, }, { .base.cra_name = "hmac(sha512)", @@ -229,7 +322,10 @@ static struct shash_alg algs[] = { .update = crypto_hmac_sha512_update, .final = crypto_hmac_sha512_final, .digest = crypto_hmac_sha512_digest, + .export = crypto_hmac_sha512_export, + .import = crypto_hmac_sha512_import, .descsize = sizeof(struct hmac_sha512_ctx), + .statesize = SHA512_SHASH_STATE_SIZE, }, }; -- cgit v1.2.3 From 24c91b62ac50a798ceabb3482efbca3e0e88a2db Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:11 -0700 Subject: lib/crypto: arm/sha512: Migrate optimized SHA-512 code to library Instead of exposing the arm-optimized SHA-512 code via arm-specific crypto_shash algorithms, just implement the sha512_blocks() library function. This is much simpler; it makes the SHA-512 (and SHA-384) library functions arm-optimized, and it fixes the longstanding issue where the arm-optimized SHA-512 code was disabled by default. SHA-512 remains available through crypto_shash, but individual architectures no longer need to handle it. To match sha512_blocks(), change the type of the nblocks parameter of the assembly functions from int to size_t. The assembly functions actually already treated it as size_t.
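As a rough sketch of the new shape (assumptions, not the patch's verbatim lib/crypto/arm/sha512.h): sha512_blocks() simply dispatches to the assembly routines, taking the NEON path when the kernel supports it and SIMD is currently usable. The sha512_block_data_order() and sha512_block_data_order_neon() symbols are the existing assembly entry points; the struct sha512_block_state type, the have_neon static key, and the crypto_simd_usable() check are assumptions about the library-side glue.

#include <asm/neon.h>
#include <crypto/internal/simd.h>
#include <crypto/sha2.h>
#include <linux/jump_label.h>
#include <linux/linkage.h>
#include <linux/types.h>

/* Assumed to be enabled from a CPU-feature check at module init time. */
static DEFINE_STATIC_KEY_FALSE(have_neon);

/* nblocks is size_t now, matching what the assembly always assumed. */
asmlinkage void sha512_block_data_order(struct sha512_block_state *state,
					const u8 *data, size_t nblocks);
asmlinkage void sha512_block_data_order_neon(struct sha512_block_state *state,
					     const u8 *data, size_t nblocks);

static void sha512_blocks(struct sha512_block_state *state,
			  const u8 *data, size_t nblocks)
{
	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
	    static_branch_likely(&have_neon) && crypto_simd_usable()) {
		kernel_neon_begin();
		sha512_block_data_order_neon(state, data, nblocks);
		kernel_neon_end();
	} else {
		sha512_block_data_order(state, data, nblocks);
	}
}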
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm/configs/exynos_defconfig | 1 - arch/arm/configs/milbeaut_m10v_defconfig | 1 - arch/arm/configs/multi_v7_defconfig | 1 - arch/arm/configs/omap2plus_defconfig | 1 - arch/arm/configs/pxa_defconfig | 1 - arch/arm/crypto/Kconfig | 10 - arch/arm/crypto/Makefile | 15 - arch/arm/crypto/sha512-armv4.pl | 657 ------------------------------- arch/arm/crypto/sha512-glue.c | 110 ------ arch/arm/crypto/sha512-neon-glue.c | 75 ---- arch/arm/crypto/sha512.h | 3 - lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 14 + lib/crypto/arm/.gitignore | 2 + lib/crypto/arm/sha512-armv4.pl | 657 +++++++++++++++++++++++++++++++ lib/crypto/arm/sha512.h | 38 ++ 16 files changed, 712 insertions(+), 875 deletions(-) delete mode 100644 arch/arm/crypto/sha512-armv4.pl delete mode 100644 arch/arm/crypto/sha512-glue.c delete mode 100644 arch/arm/crypto/sha512-neon-glue.c delete mode 100644 arch/arm/crypto/sha512.h create mode 100644 lib/crypto/arm/.gitignore create mode 100644 lib/crypto/arm/sha512-armv4.pl create mode 100644 lib/crypto/arm/sha512.h diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig index f71af368674c..d58e30069304 100644 --- a/arch/arm/configs/exynos_defconfig +++ b/arch/arm/configs/exynos_defconfig @@ -364,7 +364,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_CRYPTO_SHA1_ARM_NEON=m -CONFIG_CRYPTO_SHA512_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m CONFIG_CRYPTO_CHACHA20_NEON=m CONFIG_CRYPTO_DEV_EXYNOS_RNG=y diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig index 242e7d5a3f68..8ebf8bd872fe 100644 --- a/arch/arm/configs/milbeaut_m10v_defconfig +++ b/arch/arm/configs/milbeaut_m10v_defconfig @@ -100,7 +100,6 @@ CONFIG_CRYPTO_SEQIV=m CONFIG_CRYPTO_GHASH_ARM_CE=m CONFIG_CRYPTO_SHA1_ARM_NEON=m CONFIG_CRYPTO_SHA1_ARM_CE=m -CONFIG_CRYPTO_SHA512_ARM=m CONFIG_CRYPTO_AES_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m CONFIG_CRYPTO_AES_ARM_CE=m diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index 50c170b4619f..3fd07e864ca8 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -1282,7 +1282,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_CRYPTO_GHASH_ARM_CE=m CONFIG_CRYPTO_SHA1_ARM_NEON=m CONFIG_CRYPTO_SHA1_ARM_CE=m -CONFIG_CRYPTO_SHA512_ARM=m CONFIG_CRYPTO_AES_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m CONFIG_CRYPTO_AES_ARM_CE=m diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 9f9780c8e62a..530dfb8338c9 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -705,7 +705,6 @@ CONFIG_SECURITY=y CONFIG_CRYPTO_MICHAEL_MIC=y CONFIG_CRYPTO_GHASH_ARM_CE=m CONFIG_CRYPTO_SHA1_ARM_NEON=m -CONFIG_CRYPTO_SHA512_ARM=m CONFIG_CRYPTO_AES_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m CONFIG_CRYPTO_CHACHA20_NEON=m diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index ff29c5b0e9c9..eaa44574d4a6 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -659,7 +659,6 @@ CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_DEFLATE=y CONFIG_CRYPTO_LZO=y CONFIG_CRYPTO_SHA1_ARM=m -CONFIG_CRYPTO_SHA512_ARM=m CONFIG_CRYPTO_AES_ARM=m CONFIG_FONTS=y CONFIG_FONT_8x8=y diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 7efb9a8596e4..a18f97f1597c 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ 
-93,16 +93,6 @@ config CRYPTO_SHA1_ARM_CE Architecture: arm using ARMv8 Crypto Extensions -config CRYPTO_SHA512_ARM - tristate "Hash functions: SHA-384 and SHA-512 (NEON)" - select CRYPTO_HASH - depends on !CPU_V7M - help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180) - - Architecture: arm using - - NEON (Advanced SIMD) extensions - config CRYPTO_AES_ARM tristate "Ciphers: AES" select CRYPTO_ALGAPI diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 8479137c6e80..78a4042d8761 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -7,7 +7,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o -obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o @@ -20,23 +19,9 @@ aes-arm-y := aes-cipher-core.o aes-cipher-glue.o aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o sha1-arm-y := sha1-armv4-large.o sha1_glue.o sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o -sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o -sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y) blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o curve25519-neon-y := curve25519-core.o curve25519-glue.o - -quiet_cmd_perl = PERL $@ - cmd_perl = $(PERL) $(<) > $(@) - -$(obj)/%-core.S: $(src)/%-armv4.pl - $(call cmd,perl) - -clean-files += sha512-core.S - -aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 - -AFLAGS_sha512-core.o += $(aflags-thumb2-y) diff --git a/arch/arm/crypto/sha512-armv4.pl b/arch/arm/crypto/sha512-armv4.pl deleted file mode 100644 index 2fc3516912fa..000000000000 --- a/arch/arm/crypto/sha512-armv4.pl +++ /dev/null @@ -1,657 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -# This code is taken from the OpenSSL project but the author (Andy Polyakov) -# has relicensed it under the GPLv2. Therefore this program is free software; -# you can redistribute it and/or modify it under the terms of the GNU General -# Public License version 2 as published by the Free Software Foundation. -# -# The original headers, including the original license headers, are -# included below for completeness. - -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see https://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# SHA512 block procedure for ARMv4. September 2007. - -# This code is ~4.5 (four and a half) times faster than code generated -# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue -# Xscale PXA250 core]. -# -# July 2010. -# -# Rescheduling for dual-issue pipeline resulted in 6% improvement on -# Cortex A8 core and ~40 cycles per processed byte. - -# February 2011. -# -# Profiler-assisted and platform-specific optimization resulted in 7% -# improvement on Coxtex A8 core and ~38 cycles per byte. - -# March 2011. 
-# -# Add NEON implementation. On Cortex A8 it was measured to process -# one byte in 23.3 cycles or ~60% faster than integer-only code. - -# August 2012. -# -# Improve NEON performance by 12% on Snapdragon S4. In absolute -# terms it's 22.6 cycles per byte, which is disappointing result. -# Technical writers asserted that 3-way S4 pipeline can sustain -# multiple NEON instructions per cycle, but dual NEON issue could -# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html -# for further details. On side note Cortex-A15 processes one byte in -# 16 cycles. - -# Byte order [in]dependence. ========================================= -# -# Originally caller was expected to maintain specific *dword* order in -# h[0-7], namely with most significant dword at *lower* address, which -# was reflected in below two parameters as 0 and 4. Now caller is -# expected to maintain native byte order for whole 64-bit values. -$hi="HI"; -$lo="LO"; -# ==================================================================== - -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; - -$ctx="r0"; # parameter block -$inp="r1"; -$len="r2"; - -$Tlo="r3"; -$Thi="r4"; -$Alo="r5"; -$Ahi="r6"; -$Elo="r7"; -$Ehi="r8"; -$t0="r9"; -$t1="r10"; -$t2="r11"; -$t3="r12"; -############ r13 is stack pointer -$Ktbl="r14"; -############ r15 is program counter - -$Aoff=8*0; -$Boff=8*1; -$Coff=8*2; -$Doff=8*3; -$Eoff=8*4; -$Foff=8*5; -$Goff=8*6; -$Hoff=8*7; -$Xoff=8*8; - -sub BODY_00_15() { -my $magic = shift; -$code.=<<___; - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov $t0,$Elo,lsr#14 - str $Tlo,[sp,#$Xoff+0] - mov $t1,$Ehi,lsr#14 - str $Thi,[sp,#$Xoff+4] - eor $t0,$t0,$Ehi,lsl#18 - ldr $t2,[sp,#$Hoff+0] @ h.lo - eor $t1,$t1,$Elo,lsl#18 - ldr $t3,[sp,#$Hoff+4] @ h.hi - eor $t0,$t0,$Elo,lsr#18 - eor $t1,$t1,$Ehi,lsr#18 - eor $t0,$t0,$Ehi,lsl#14 - eor $t1,$t1,$Elo,lsl#14 - eor $t0,$t0,$Ehi,lsr#9 - eor $t1,$t1,$Elo,lsr#9 - eor $t0,$t0,$Elo,lsl#23 - eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) - adds $Tlo,$Tlo,$t0 - ldr $t0,[sp,#$Foff+0] @ f.lo - adc $Thi,$Thi,$t1 @ T += Sigma1(e) - ldr $t1,[sp,#$Foff+4] @ f.hi - adds $Tlo,$Tlo,$t2 - ldr $t2,[sp,#$Goff+0] @ g.lo - adc $Thi,$Thi,$t3 @ T += h - ldr $t3,[sp,#$Goff+4] @ g.hi - - eor $t0,$t0,$t2 - str $Elo,[sp,#$Eoff+0] - eor $t1,$t1,$t3 - str $Ehi,[sp,#$Eoff+4] - and $t0,$t0,$Elo - str $Alo,[sp,#$Aoff+0] - and $t1,$t1,$Ehi - str $Ahi,[sp,#$Aoff+4] - eor $t0,$t0,$t2 - ldr $t2,[$Ktbl,#$lo] @ K[i].lo - eor $t1,$t1,$t3 @ Ch(e,f,g) - ldr $t3,[$Ktbl,#$hi] @ K[i].hi - - adds $Tlo,$Tlo,$t0 - ldr $Elo,[sp,#$Doff+0] @ d.lo - adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) - ldr $Ehi,[sp,#$Doff+4] @ d.hi - adds $Tlo,$Tlo,$t2 - and $t0,$t2,#0xff - adc $Thi,$Thi,$t3 @ T += K[i] - adds $Elo,$Elo,$Tlo - ldr $t2,[sp,#$Boff+0] @ b.lo - adc $Ehi,$Ehi,$Thi @ d += T - teq $t0,#$magic - - ldr $t3,[sp,#$Coff+0] @ c.lo -#if __ARM_ARCH__>=7 - it eq @ Thumb2 thing, sanity check in ARM -#endif - orreq $Ktbl,$Ktbl,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov $t0,$Alo,lsr#28 - mov $t1,$Ahi,lsr#28 - eor $t0,$t0,$Ahi,lsl#4 - eor $t1,$t1,$Alo,lsl#4 - eor $t0,$t0,$Ahi,lsr#2 - eor $t1,$t1,$Alo,lsr#2 - eor $t0,$t0,$Alo,lsl#30 - eor $t1,$t1,$Ahi,lsl#30 - eor $t0,$t0,$Ahi,lsr#7 - eor $t1,$t1,$Alo,lsr#7 - eor $t0,$t0,$Alo,lsl#25 - eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) 
- adds $Tlo,$Tlo,$t0 - and $t0,$Alo,$t2 - adc $Thi,$Thi,$t1 @ T += Sigma0(a) - - ldr $t1,[sp,#$Boff+4] @ b.hi - orr $Alo,$Alo,$t2 - ldr $t2,[sp,#$Coff+4] @ c.hi - and $Alo,$Alo,$t3 - and $t3,$Ahi,$t1 - orr $Ahi,$Ahi,$t1 - orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo - and $Ahi,$Ahi,$t2 - adds $Alo,$Alo,$Tlo - orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc $Ahi,$Ahi,$Thi @ h += T - tst $Ktbl,#1 - add $Ktbl,$Ktbl,#8 -___ -} -$code=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} -# define VFP_ABI_POP vldmia sp!,{d8-d15} -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ 7 -# define VFP_ABI_PUSH -# define VFP_ABI_POP -#endif - -#ifdef __ARMEL__ -# define LO 0 -# define HI 4 -# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 -#else -# define HI 0 -# define LO 4 -# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 -#endif - -.text -#if __ARM_ARCH__<7 -.code 32 -#else -.syntax unified -# ifdef __thumb2__ -.thumb -# else -.code 32 -# endif -#endif - -.type K512,%object -.align 5 -K512: -WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) -WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) -WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) -WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) -WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) -WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) -WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) -WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) -WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) -WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) -WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) -WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) -WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) -WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) -WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) -WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) -WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) -WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) -WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) -WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) -WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) -WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) -WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) -WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) -WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) -WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) -WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) -WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) -WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) -WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) -WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) -WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) -WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) -WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) -WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) -WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) -WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) -WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) -WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) -WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) -.size K512,.-K512 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha512_block_data_order -.skip 32-4 -#else -.skip 32 -#endif - -.global sha512_block_data_order -.type sha512_block_data_order,%function -sha512_block_data_order: -.Lsha512_block_data_order: -#if __ARM_ARCH__<7 - 
sub r3,pc,#8 @ sha512_block_data_order -#else - adr r3,.Lsha512_block_data_order -#endif -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P - tst r12,#1 - bne .LNEON -#endif - add $len,$inp,$len,lsl#7 @ len to point at the end of inp - stmdb sp!,{r4-r12,lr} - sub $Ktbl,r3,#672 @ K512 - sub sp,sp,#9*8 - - ldr $Elo,[$ctx,#$Eoff+$lo] - ldr $Ehi,[$ctx,#$Eoff+$hi] - ldr $t0, [$ctx,#$Goff+$lo] - ldr $t1, [$ctx,#$Goff+$hi] - ldr $t2, [$ctx,#$Hoff+$lo] - ldr $t3, [$ctx,#$Hoff+$hi] -.Loop: - str $t0, [sp,#$Goff+0] - str $t1, [sp,#$Goff+4] - str $t2, [sp,#$Hoff+0] - str $t3, [sp,#$Hoff+4] - ldr $Alo,[$ctx,#$Aoff+$lo] - ldr $Ahi,[$ctx,#$Aoff+$hi] - ldr $Tlo,[$ctx,#$Boff+$lo] - ldr $Thi,[$ctx,#$Boff+$hi] - ldr $t0, [$ctx,#$Coff+$lo] - ldr $t1, [$ctx,#$Coff+$hi] - ldr $t2, [$ctx,#$Doff+$lo] - ldr $t3, [$ctx,#$Doff+$hi] - str $Tlo,[sp,#$Boff+0] - str $Thi,[sp,#$Boff+4] - str $t0, [sp,#$Coff+0] - str $t1, [sp,#$Coff+4] - str $t2, [sp,#$Doff+0] - str $t3, [sp,#$Doff+4] - ldr $Tlo,[$ctx,#$Foff+$lo] - ldr $Thi,[$ctx,#$Foff+$hi] - str $Tlo,[sp,#$Foff+0] - str $Thi,[sp,#$Foff+4] - -.L00_15: -#if __ARM_ARCH__<7 - ldrb $Tlo,[$inp,#7] - ldrb $t0, [$inp,#6] - ldrb $t1, [$inp,#5] - ldrb $t2, [$inp,#4] - ldrb $Thi,[$inp,#3] - ldrb $t3, [$inp,#2] - orr $Tlo,$Tlo,$t0,lsl#8 - ldrb $t0, [$inp,#1] - orr $Tlo,$Tlo,$t1,lsl#16 - ldrb $t1, [$inp],#8 - orr $Tlo,$Tlo,$t2,lsl#24 - orr $Thi,$Thi,$t3,lsl#8 - orr $Thi,$Thi,$t0,lsl#16 - orr $Thi,$Thi,$t1,lsl#24 -#else - ldr $Tlo,[$inp,#4] - ldr $Thi,[$inp],#8 -#ifdef __ARMEL__ - rev $Tlo,$Tlo - rev $Thi,$Thi -#endif -#endif -___ - &BODY_00_15(0x94); -$code.=<<___; - tst $Ktbl,#1 - beq .L00_15 - ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] - ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] - bic $Ktbl,$Ktbl,#1 -.L16_79: - @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) - @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 - @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 - mov $Tlo,$t0,lsr#1 - ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] - mov $Thi,$t1,lsr#1 - ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] - eor $Tlo,$Tlo,$t1,lsl#31 - eor $Thi,$Thi,$t0,lsl#31 - eor $Tlo,$Tlo,$t0,lsr#8 - eor $Thi,$Thi,$t1,lsr#8 - eor $Tlo,$Tlo,$t1,lsl#24 - eor $Thi,$Thi,$t0,lsl#24 - eor $Tlo,$Tlo,$t0,lsr#7 - eor $Thi,$Thi,$t1,lsr#7 - eor $Tlo,$Tlo,$t1,lsl#25 - - @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) - @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 - @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 - mov $t0,$t2,lsr#19 - mov $t1,$t3,lsr#19 - eor $t0,$t0,$t3,lsl#13 - eor $t1,$t1,$t2,lsl#13 - eor $t0,$t0,$t3,lsr#29 - eor $t1,$t1,$t2,lsr#29 - eor $t0,$t0,$t2,lsl#3 - eor $t1,$t1,$t3,lsl#3 - eor $t0,$t0,$t2,lsr#6 - eor $t1,$t1,$t3,lsr#6 - ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] - eor $t0,$t0,$t3,lsl#26 - - ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] - adds $Tlo,$Tlo,$t0 - ldr $t0,[sp,#`$Xoff+8*16`+0] - adc $Thi,$Thi,$t1 - - ldr $t1,[sp,#`$Xoff+8*16`+4] - adds $Tlo,$Tlo,$t2 - adc $Thi,$Thi,$t3 - adds $Tlo,$Tlo,$t0 - adc $Thi,$Thi,$t1 -___ - &BODY_00_15(0x17); -$code.=<<___; -#if __ARM_ARCH__>=7 - ittt eq @ Thumb2 thing, sanity check in ARM -#endif - ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] - ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] - beq .L16_79 - bic $Ktbl,$Ktbl,#1 - - ldr $Tlo,[sp,#$Boff+0] - ldr $Thi,[sp,#$Boff+4] - ldr $t0, [$ctx,#$Aoff+$lo] - ldr $t1, [$ctx,#$Aoff+$hi] - ldr $t2, [$ctx,#$Boff+$lo] - ldr $t3, [$ctx,#$Boff+$hi] - adds $t0,$Alo,$t0 - str $t0, [$ctx,#$Aoff+$lo] - adc $t1,$Ahi,$t1 - str $t1, [$ctx,#$Aoff+$hi] - adds $t2,$Tlo,$t2 - str $t2, [$ctx,#$Boff+$lo] - adc $t3,$Thi,$t3 - str $t3, [$ctx,#$Boff+$hi] - 
- ldr $Alo,[sp,#$Coff+0] - ldr $Ahi,[sp,#$Coff+4] - ldr $Tlo,[sp,#$Doff+0] - ldr $Thi,[sp,#$Doff+4] - ldr $t0, [$ctx,#$Coff+$lo] - ldr $t1, [$ctx,#$Coff+$hi] - ldr $t2, [$ctx,#$Doff+$lo] - ldr $t3, [$ctx,#$Doff+$hi] - adds $t0,$Alo,$t0 - str $t0, [$ctx,#$Coff+$lo] - adc $t1,$Ahi,$t1 - str $t1, [$ctx,#$Coff+$hi] - adds $t2,$Tlo,$t2 - str $t2, [$ctx,#$Doff+$lo] - adc $t3,$Thi,$t3 - str $t3, [$ctx,#$Doff+$hi] - - ldr $Tlo,[sp,#$Foff+0] - ldr $Thi,[sp,#$Foff+4] - ldr $t0, [$ctx,#$Eoff+$lo] - ldr $t1, [$ctx,#$Eoff+$hi] - ldr $t2, [$ctx,#$Foff+$lo] - ldr $t3, [$ctx,#$Foff+$hi] - adds $Elo,$Elo,$t0 - str $Elo,[$ctx,#$Eoff+$lo] - adc $Ehi,$Ehi,$t1 - str $Ehi,[$ctx,#$Eoff+$hi] - adds $t2,$Tlo,$t2 - str $t2, [$ctx,#$Foff+$lo] - adc $t3,$Thi,$t3 - str $t3, [$ctx,#$Foff+$hi] - - ldr $Alo,[sp,#$Goff+0] - ldr $Ahi,[sp,#$Goff+4] - ldr $Tlo,[sp,#$Hoff+0] - ldr $Thi,[sp,#$Hoff+4] - ldr $t0, [$ctx,#$Goff+$lo] - ldr $t1, [$ctx,#$Goff+$hi] - ldr $t2, [$ctx,#$Hoff+$lo] - ldr $t3, [$ctx,#$Hoff+$hi] - adds $t0,$Alo,$t0 - str $t0, [$ctx,#$Goff+$lo] - adc $t1,$Ahi,$t1 - str $t1, [$ctx,#$Goff+$hi] - adds $t2,$Tlo,$t2 - str $t2, [$ctx,#$Hoff+$lo] - adc $t3,$Thi,$t3 - str $t3, [$ctx,#$Hoff+$hi] - - add sp,sp,#640 - sub $Ktbl,$Ktbl,#640 - - teq $inp,$len - bne .Loop - - add sp,sp,#8*9 @ destroy frame -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r12,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size sha512_block_data_order,.-sha512_block_data_order -___ - -{ -my @Sigma0=(28,34,39); -my @Sigma1=(14,18,41); -my @sigma0=(1, 8, 7); -my @sigma1=(19,61,6); - -my $Ktbl="r3"; -my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch - -my @X=map("d$_",(0..15)); -my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); - -sub NEON_00_15() { -my $i=shift; -my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; -my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps - -$code.=<<___ if ($i<16 || $i&1); - vshr.u64 $t0,$e,#@Sigma1[0] @ $i -#if $i<16 - vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned -#endif - vshr.u64 $t1,$e,#@Sigma1[1] -#if $i>0 - vadd.i64 $a,$Maj @ h+=Maj from the past -#endif - vshr.u64 $t2,$e,#@Sigma1[2] -___ -$code.=<<___; - vld1.64 {$K},[$Ktbl,:64]! 
@ K[i++] - vsli.64 $t0,$e,#`64-@Sigma1[0]` - vsli.64 $t1,$e,#`64-@Sigma1[1]` - vmov $Ch,$e - vsli.64 $t2,$e,#`64-@Sigma1[2]` -#if $i<16 && defined(__ARMEL__) - vrev64.8 @X[$i],@X[$i] -#endif - veor $t1,$t0 - vbsl $Ch,$f,$g @ Ch(e,f,g) - vshr.u64 $t0,$a,#@Sigma0[0] - veor $t2,$t1 @ Sigma1(e) - vadd.i64 $T1,$Ch,$h - vshr.u64 $t1,$a,#@Sigma0[1] - vsli.64 $t0,$a,#`64-@Sigma0[0]` - vadd.i64 $T1,$t2 - vshr.u64 $t2,$a,#@Sigma0[2] - vadd.i64 $K,@X[$i%16] - vsli.64 $t1,$a,#`64-@Sigma0[1]` - veor $Maj,$a,$b - vsli.64 $t2,$a,#`64-@Sigma0[2]` - veor $h,$t0,$t1 - vadd.i64 $T1,$K - vbsl $Maj,$c,$b @ Maj(a,b,c) - veor $h,$t2 @ Sigma0(a) - vadd.i64 $d,$T1 - vadd.i64 $Maj,$T1 - @ vadd.i64 $h,$Maj -___ -} - -sub NEON_16_79() { -my $i=shift; - -if ($i&1) { &NEON_00_15($i,@_); return; } - -# 2x-vectorized, therefore runs every 2nd round -my @X=map("q$_",(0..7)); # view @X as 128-bit vector -my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps -my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 -my $e=@_[4]; # $e from NEON_00_15 -$i /= 2; -$code.=<<___; - vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] - vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] - vadd.i64 @_[0],d30 @ h+=Maj from the past - vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] - vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` - vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] - vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` - veor $s1,$t0 - vshr.u64 $t0,$s0,#@sigma0[0] - veor $s1,$t1 @ sigma1(X[i+14]) - vshr.u64 $t1,$s0,#@sigma0[1] - vadd.i64 @X[$i%8],$s1 - vshr.u64 $s1,$s0,#@sigma0[2] - vsli.64 $t0,$s0,#`64-@sigma0[0]` - vsli.64 $t1,$s0,#`64-@sigma0[1]` - vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] - veor $s1,$t0 - vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 - vadd.i64 @X[$i%8],$s0 - vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 - veor $s1,$t1 @ sigma0(X[i+1]) - vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 - vadd.i64 @X[$i%8],$s1 -___ - &NEON_00_15(2*$i,@_); -} - -$code.=<<___; -#if __ARM_MAX_ARCH__>=7 -.arch armv7-a -.fpu neon - -.global sha512_block_data_order_neon -.type sha512_block_data_order_neon,%function -.align 4 -sha512_block_data_order_neon: -.LNEON: - dmb @ errata #451034 on early Cortex A8 - add $len,$inp,$len,lsl#7 @ len to point at the end of inp - VFP_ABI_PUSH - adr $Ktbl,.Lsha512_block_data_order - sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512 - vldmia $ctx,{$A-$H} @ load context -.Loop_neon: -___ -for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } -$code.=<<___; - mov $cnt,#4 -.L16_79_neon: - subs $cnt,#1 -___ -for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } -$code.=<<___; - bne .L16_79_neon - - vadd.i64 $A,d30 @ h+=Maj from the past - vldmia $ctx,{d24-d31} @ load context to temp - vadd.i64 q8,q12 @ vectorized accumulate - vadd.i64 q9,q13 - vadd.i64 q10,q14 - vadd.i64 q11,q15 - vstmia $ctx,{$A-$H} @ save context - teq $inp,$len - sub $Ktbl,#640 @ rewind K512 - bne .Loop_neon - - VFP_ABI_POP - ret @ bx lr -.size sha512_block_data_order_neon,.-sha512_block_data_order_neon -#endif -___ -} -$code.=<<___; -.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by " -.align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm OPENSSL_armcap_P,4,4 -#endif -___ - -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -$code =~ s/\bret\b/bx lr/gm; - -open SELF,$0; -while() { - next if (/^#!/); - last if (!s/^#/@/ and !/^$/); - print; -} -close SELF; - -print $code; -close STDOUT; # enforce flush diff --git a/arch/arm/crypto/sha512-glue.c 
b/arch/arm/crypto/sha512-glue.c deleted file mode 100644 index f8a6480889b1..000000000000 --- a/arch/arm/crypto/sha512-glue.c +++ /dev/null @@ -1,110 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sha512-glue.c - accelerated SHA-384/512 for ARM - * - * Copyright (C) 2015 Linaro Ltd - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "sha512.h" - -MODULE_DESCRIPTION("Accelerated SHA-384/SHA-512 secure hash for ARM"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); - -MODULE_ALIAS_CRYPTO("sha384"); -MODULE_ALIAS_CRYPTO("sha512"); -MODULE_ALIAS_CRYPTO("sha384-arm"); -MODULE_ALIAS_CRYPTO("sha512-arm"); - -asmlinkage void sha512_block_data_order(struct sha512_state *state, - u8 const *src, int blocks); - -static int sha512_arm_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_base_do_update_blocks(desc, data, len, - sha512_block_data_order); -} - -static int sha512_arm_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha512_base_do_finup(desc, data, len, sha512_block_data_order); - return sha512_base_finish(desc, out); -} - -static struct shash_alg sha512_arm_algs[] = { { - .init = sha384_base_init, - .update = sha512_arm_update, - .finup = sha512_arm_finup, - .descsize = SHA512_STATE_SIZE, - .digestsize = SHA384_DIGEST_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name = "sha384-arm", - .cra_priority = 250, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .init = sha512_base_init, - .update = sha512_arm_update, - .finup = sha512_arm_finup, - .descsize = SHA512_STATE_SIZE, - .digestsize = SHA512_DIGEST_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name = "sha512-arm", - .cra_priority = 250, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int __init sha512_arm_mod_init(void) -{ - int err; - - err = crypto_register_shashes(sha512_arm_algs, - ARRAY_SIZE(sha512_arm_algs)); - if (err) - return err; - - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) { - err = crypto_register_shashes(sha512_neon_algs, - ARRAY_SIZE(sha512_neon_algs)); - if (err) - goto err_unregister; - } - return 0; - -err_unregister: - crypto_unregister_shashes(sha512_arm_algs, - ARRAY_SIZE(sha512_arm_algs)); - - return err; -} - -static void __exit sha512_arm_mod_fini(void) -{ - crypto_unregister_shashes(sha512_arm_algs, - ARRAY_SIZE(sha512_arm_algs)); - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) - crypto_unregister_shashes(sha512_neon_algs, - ARRAY_SIZE(sha512_neon_algs)); -} - -module_init(sha512_arm_mod_init); -module_exit(sha512_arm_mod_fini); diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c deleted file mode 100644 index bd528077fefb..000000000000 --- a/arch/arm/crypto/sha512-neon-glue.c +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sha512-neon-glue.c - accelerated SHA-384/512 for ARM NEON - * - * Copyright (C) 2015 Linaro Ltd - */ - -#include -#include -#include -#include -#include -#include - -#include "sha512.h" - -MODULE_ALIAS_CRYPTO("sha384-neon"); -MODULE_ALIAS_CRYPTO("sha512-neon"); - -asmlinkage void sha512_block_data_order_neon(struct sha512_state *state, - const u8 *src, int blocks); - -static int sha512_neon_update(struct shash_desc *desc, const u8 *data, - 
unsigned int len) -{ - int remain; - - kernel_neon_begin(); - remain = sha512_base_do_update_blocks(desc, data, len, - sha512_block_data_order_neon); - kernel_neon_end(); - return remain; -} - -static int sha512_neon_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - kernel_neon_begin(); - sha512_base_do_finup(desc, data, len, sha512_block_data_order_neon); - kernel_neon_end(); - return sha512_base_finish(desc, out); -} - -struct shash_alg sha512_neon_algs[] = { { - .init = sha384_base_init, - .update = sha512_neon_update, - .finup = sha512_neon_finup, - .descsize = SHA512_STATE_SIZE, - .digestsize = SHA384_DIGEST_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name = "sha384-neon", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_module = THIS_MODULE, - - } -}, { - .init = sha512_base_init, - .update = sha512_neon_update, - .finup = sha512_neon_finup, - .descsize = SHA512_STATE_SIZE, - .digestsize = SHA512_DIGEST_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name = "sha512-neon", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; diff --git a/arch/arm/crypto/sha512.h b/arch/arm/crypto/sha512.h deleted file mode 100644 index eeaee52cda69..000000000000 --- a/arch/arm/crypto/sha512.h +++ /dev/null @@ -1,3 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -extern struct shash_alg sha512_neon_algs[2]; diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index d1bee3787eb3..dac6356ba0aa 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -177,6 +177,7 @@ config CRYPTO_LIB_SHA512 config CRYPTO_LIB_SHA512_ARCH bool depends on CRYPTO_LIB_SHA512 && !UML + default y if ARM && !CPU_V7M config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index f6b6f370451e..67008a1612c6 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -1,5 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 +aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) > $(@) + obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o libcryptoutils-y := memneq.o utils.o @@ -68,6 +73,15 @@ obj-$(CONFIG_CRYPTO_LIB_SHA512) += libsha512.o libsha512-y := sha512.o ifeq ($(CONFIG_CRYPTO_LIB_SHA512_ARCH),y) CFLAGS_sha512.o += -I$(src)/$(SRCARCH) + +ifeq ($(CONFIG_ARM),y) +libsha512-y += arm/sha512-core.o +$(obj)/arm/sha512-core.S: $(src)/arm/sha512-armv4.pl + $(call cmd,perlasm) +clean-files += arm/sha512-core.S +AFLAGS_arm/sha512-core.o += $(aflags-thumb2-y) +endif + endif # CONFIG_CRYPTO_LIB_SHA512_ARCH obj-$(CONFIG_MPILIB) += mpi/ diff --git a/lib/crypto/arm/.gitignore b/lib/crypto/arm/.gitignore new file mode 100644 index 000000000000..670a4d97b568 --- /dev/null +++ b/lib/crypto/arm/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +sha512-core.S diff --git a/lib/crypto/arm/sha512-armv4.pl b/lib/crypto/arm/sha512-armv4.pl new file mode 100644 index 000000000000..2fc3516912fa --- /dev/null +++ b/lib/crypto/arm/sha512-armv4.pl @@ -0,0 +1,657 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. 
Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA512 block procedure for ARMv4. September 2007. + +# This code is ~4.5 (four and a half) times faster than code generated +# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +# Xscale PXA250 core]. +# +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 6% improvement on +# Cortex A8 core and ~40 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Coxtex A8 core and ~38 cycles per byte. + +# March 2011. +# +# Add NEON implementation. On Cortex A8 it was measured to process +# one byte in 23.3 cycles or ~60% faster than integer-only code. + +# August 2012. +# +# Improve NEON performance by 12% on Snapdragon S4. In absolute +# terms it's 22.6 cycles per byte, which is disappointing result. +# Technical writers asserted that 3-way S4 pipeline can sustain +# multiple NEON instructions per cycle, but dual NEON issue could +# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html +# for further details. On side note Cortex-A15 processes one byte in +# 16 cycles. + +# Byte order [in]dependence. ========================================= +# +# Originally caller was expected to maintain specific *dword* order in +# h[0-7], namely with most significant dword at *lower* address, which +# was reflected in below two parameters as 0 and 4. Now caller is +# expected to maintain native byte order for whole 64-bit values. 
+$hi="HI"; +$lo="LO"; +# ==================================================================== + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$ctx="r0"; # parameter block +$inp="r1"; +$len="r2"; + +$Tlo="r3"; +$Thi="r4"; +$Alo="r5"; +$Ahi="r6"; +$Elo="r7"; +$Ehi="r8"; +$t0="r9"; +$t1="r10"; +$t2="r11"; +$t3="r12"; +############ r13 is stack pointer +$Ktbl="r14"; +############ r15 is program counter + +$Aoff=8*0; +$Boff=8*1; +$Coff=8*2; +$Doff=8*3; +$Eoff=8*4; +$Foff=8*5; +$Goff=8*6; +$Hoff=8*7; +$Xoff=8*8; + +sub BODY_00_15() { +my $magic = shift; +$code.=<<___; + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov $t0,$Elo,lsr#14 + str $Tlo,[sp,#$Xoff+0] + mov $t1,$Ehi,lsr#14 + str $Thi,[sp,#$Xoff+4] + eor $t0,$t0,$Ehi,lsl#18 + ldr $t2,[sp,#$Hoff+0] @ h.lo + eor $t1,$t1,$Elo,lsl#18 + ldr $t3,[sp,#$Hoff+4] @ h.hi + eor $t0,$t0,$Elo,lsr#18 + eor $t1,$t1,$Ehi,lsr#18 + eor $t0,$t0,$Ehi,lsl#14 + eor $t1,$t1,$Elo,lsl#14 + eor $t0,$t0,$Ehi,lsr#9 + eor $t1,$t1,$Elo,lsr#9 + eor $t0,$t0,$Elo,lsl#23 + eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#$Foff+0] @ f.lo + adc $Thi,$Thi,$t1 @ T += Sigma1(e) + ldr $t1,[sp,#$Foff+4] @ f.hi + adds $Tlo,$Tlo,$t2 + ldr $t2,[sp,#$Goff+0] @ g.lo + adc $Thi,$Thi,$t3 @ T += h + ldr $t3,[sp,#$Goff+4] @ g.hi + + eor $t0,$t0,$t2 + str $Elo,[sp,#$Eoff+0] + eor $t1,$t1,$t3 + str $Ehi,[sp,#$Eoff+4] + and $t0,$t0,$Elo + str $Alo,[sp,#$Aoff+0] + and $t1,$t1,$Ehi + str $Ahi,[sp,#$Aoff+4] + eor $t0,$t0,$t2 + ldr $t2,[$Ktbl,#$lo] @ K[i].lo + eor $t1,$t1,$t3 @ Ch(e,f,g) + ldr $t3,[$Ktbl,#$hi] @ K[i].hi + + adds $Tlo,$Tlo,$t0 + ldr $Elo,[sp,#$Doff+0] @ d.lo + adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) + ldr $Ehi,[sp,#$Doff+4] @ d.hi + adds $Tlo,$Tlo,$t2 + and $t0,$t2,#0xff + adc $Thi,$Thi,$t3 @ T += K[i] + adds $Elo,$Elo,$Tlo + ldr $t2,[sp,#$Boff+0] @ b.lo + adc $Ehi,$Ehi,$Thi @ d += T + teq $t0,#$magic + + ldr $t3,[sp,#$Coff+0] @ c.lo +#if __ARM_ARCH__>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq $Ktbl,$Ktbl,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov $t0,$Alo,lsr#28 + mov $t1,$Ahi,lsr#28 + eor $t0,$t0,$Ahi,lsl#4 + eor $t1,$t1,$Alo,lsl#4 + eor $t0,$t0,$Ahi,lsr#2 + eor $t1,$t1,$Alo,lsr#2 + eor $t0,$t0,$Alo,lsl#30 + eor $t1,$t1,$Ahi,lsl#30 + eor $t0,$t0,$Ahi,lsr#7 + eor $t1,$t1,$Alo,lsr#7 + eor $t0,$t0,$Alo,lsl#25 + eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) + adds $Tlo,$Tlo,$t0 + and $t0,$Alo,$t2 + adc $Thi,$Thi,$t1 @ T += Sigma0(a) + + ldr $t1,[sp,#$Boff+4] @ b.hi + orr $Alo,$Alo,$t2 + ldr $t2,[sp,#$Coff+4] @ c.hi + and $Alo,$Alo,$t3 + and $t3,$Ahi,$t1 + orr $Ahi,$Ahi,$t1 + orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo + and $Ahi,$Ahi,$t2 + adds $Alo,$Alo,$Tlo + orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc $Ahi,$Ahi,$Thi @ h += T + tst $Ktbl,#1 + add $Ktbl,$Ktbl,#8 +___ +} +$code=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if __ARM_ARCH__<7 
+.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else +.code 32 +# endif +#endif + +.type K512,%object +.align 5 +K512: +WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) +WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) +WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) +WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) +WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) +WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) +WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) +WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) +WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) +WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) +WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) +WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) +WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) +WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) +WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) +WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) +WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) +WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) +WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) +WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) +WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) +WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) +WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) +WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) +WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) +WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) +WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) +WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) +WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) +WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) +WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) +WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) +WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) +WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) +WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) +WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) +WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) +WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) +WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) +WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha512_block_data_order +.skip 32-4 +#else +.skip 32 +#endif + +.global sha512_block_data_order +.type sha512_block_data_order,%function +sha512_block_data_order: +.Lsha512_block_data_order: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ sha512_block_data_order +#else + adr r3,.Lsha512_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#1 + bne .LNEON +#endif + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4-r12,lr} + sub $Ktbl,r3,#672 @ K512 + sub sp,sp,#9*8 + + ldr $Elo,[$ctx,#$Eoff+$lo] + ldr $Ehi,[$ctx,#$Eoff+$hi] + ldr $t0, [$ctx,#$Goff+$lo] + ldr $t1, [$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] +.Loop: + str $t0, [sp,#$Goff+0] + str $t1, [sp,#$Goff+4] + str $t2, [sp,#$Hoff+0] + str $t3, [sp,#$Hoff+4] + ldr $Alo,[$ctx,#$Aoff+$lo] + ldr $Ahi,[$ctx,#$Aoff+$hi] + ldr $Tlo,[$ctx,#$Boff+$lo] + ldr $Thi,[$ctx,#$Boff+$hi] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + str $Tlo,[sp,#$Boff+0] + str $Thi,[sp,#$Boff+4] + 
str $t0, [sp,#$Coff+0] + str $t1, [sp,#$Coff+4] + str $t2, [sp,#$Doff+0] + str $t3, [sp,#$Doff+4] + ldr $Tlo,[$ctx,#$Foff+$lo] + ldr $Thi,[$ctx,#$Foff+$hi] + str $Tlo,[sp,#$Foff+0] + str $Thi,[sp,#$Foff+4] + +.L00_15: +#if __ARM_ARCH__<7 + ldrb $Tlo,[$inp,#7] + ldrb $t0, [$inp,#6] + ldrb $t1, [$inp,#5] + ldrb $t2, [$inp,#4] + ldrb $Thi,[$inp,#3] + ldrb $t3, [$inp,#2] + orr $Tlo,$Tlo,$t0,lsl#8 + ldrb $t0, [$inp,#1] + orr $Tlo,$Tlo,$t1,lsl#16 + ldrb $t1, [$inp],#8 + orr $Tlo,$Tlo,$t2,lsl#24 + orr $Thi,$Thi,$t3,lsl#8 + orr $Thi,$Thi,$t0,lsl#16 + orr $Thi,$Thi,$t1,lsl#24 +#else + ldr $Tlo,[$inp,#4] + ldr $Thi,[$inp],#8 +#ifdef __ARMEL__ + rev $Tlo,$Tlo + rev $Thi,$Thi +#endif +#endif +___ + &BODY_00_15(0x94); +$code.=<<___; + tst $Ktbl,#1 + beq .L00_15 + ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] + bic $Ktbl,$Ktbl,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov $Tlo,$t0,lsr#1 + ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] + mov $Thi,$t1,lsr#1 + ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] + eor $Tlo,$Tlo,$t1,lsl#31 + eor $Thi,$Thi,$t0,lsl#31 + eor $Tlo,$Tlo,$t0,lsr#8 + eor $Thi,$Thi,$t1,lsr#8 + eor $Tlo,$Tlo,$t1,lsl#24 + eor $Thi,$Thi,$t0,lsl#24 + eor $Tlo,$Tlo,$t0,lsr#7 + eor $Thi,$Thi,$t1,lsr#7 + eor $Tlo,$Tlo,$t1,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov $t0,$t2,lsr#19 + mov $t1,$t3,lsr#19 + eor $t0,$t0,$t3,lsl#13 + eor $t1,$t1,$t2,lsl#13 + eor $t0,$t0,$t3,lsr#29 + eor $t1,$t1,$t2,lsr#29 + eor $t0,$t0,$t2,lsl#3 + eor $t1,$t1,$t3,lsl#3 + eor $t0,$t0,$t2,lsr#6 + eor $t1,$t1,$t3,lsr#6 + ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] + eor $t0,$t0,$t3,lsl#26 + + ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#`$Xoff+8*16`+0] + adc $Thi,$Thi,$t1 + + ldr $t1,[sp,#`$Xoff+8*16`+4] + adds $Tlo,$Tlo,$t2 + adc $Thi,$Thi,$t3 + adds $Tlo,$Tlo,$t0 + adc $Thi,$Thi,$t1 +___ + &BODY_00_15(0x17); +$code.=<<___; +#if __ARM_ARCH__>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] + beq .L16_79 + bic $Ktbl,$Ktbl,#1 + + ldr $Tlo,[sp,#$Boff+0] + ldr $Thi,[sp,#$Boff+4] + ldr $t0, [$ctx,#$Aoff+$lo] + ldr $t1, [$ctx,#$Aoff+$hi] + ldr $t2, [$ctx,#$Boff+$lo] + ldr $t3, [$ctx,#$Boff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Aoff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Aoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Boff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Boff+$hi] + + ldr $Alo,[sp,#$Coff+0] + ldr $Ahi,[sp,#$Coff+4] + ldr $Tlo,[sp,#$Doff+0] + ldr $Thi,[sp,#$Doff+4] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Coff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Coff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Doff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Doff+$hi] + + ldr $Tlo,[sp,#$Foff+0] + ldr $Thi,[sp,#$Foff+4] + ldr $t0, [$ctx,#$Eoff+$lo] + ldr $t1, [$ctx,#$Eoff+$hi] + ldr $t2, [$ctx,#$Foff+$lo] + ldr $t3, [$ctx,#$Foff+$hi] + adds $Elo,$Elo,$t0 + str $Elo,[$ctx,#$Eoff+$lo] + adc $Ehi,$Ehi,$t1 + str $Ehi,[$ctx,#$Eoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Foff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Foff+$hi] + + ldr $Alo,[sp,#$Goff+0] + ldr $Ahi,[sp,#$Goff+4] + ldr $Tlo,[sp,#$Hoff+0] + ldr $Thi,[sp,#$Hoff+4] + ldr $t0, [$ctx,#$Goff+$lo] + ldr $t1, 
[$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Goff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Goff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Hoff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Hoff+$hi] + + add sp,sp,#640 + sub $Ktbl,$Ktbl,#640 + + teq $inp,$len + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha512_block_data_order,.-sha512_block_data_order +___ + +{ +my @Sigma0=(28,34,39); +my @Sigma1=(14,18,41); +my @sigma0=(1, 8, 7); +my @sigma1=(19,61,6); + +my $Ktbl="r3"; +my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch + +my @X=map("d$_",(0..15)); +my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); + +sub NEON_00_15() { +my $i=shift; +my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps + +$code.=<<___ if ($i<16 || $i&1); + vshr.u64 $t0,$e,#@Sigma1[0] @ $i +#if $i<16 + vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned +#endif + vshr.u64 $t1,$e,#@Sigma1[1] +#if $i>0 + vadd.i64 $a,$Maj @ h+=Maj from the past +#endif + vshr.u64 $t2,$e,#@Sigma1[2] +___ +$code.=<<___; + vld1.64 {$K},[$Ktbl,:64]! @ K[i++] + vsli.64 $t0,$e,#`64-@Sigma1[0]` + vsli.64 $t1,$e,#`64-@Sigma1[1]` + vmov $Ch,$e + vsli.64 $t2,$e,#`64-@Sigma1[2]` +#if $i<16 && defined(__ARMEL__) + vrev64.8 @X[$i],@X[$i] +#endif + veor $t1,$t0 + vbsl $Ch,$f,$g @ Ch(e,f,g) + vshr.u64 $t0,$a,#@Sigma0[0] + veor $t2,$t1 @ Sigma1(e) + vadd.i64 $T1,$Ch,$h + vshr.u64 $t1,$a,#@Sigma0[1] + vsli.64 $t0,$a,#`64-@Sigma0[0]` + vadd.i64 $T1,$t2 + vshr.u64 $t2,$a,#@Sigma0[2] + vadd.i64 $K,@X[$i%16] + vsli.64 $t1,$a,#`64-@Sigma0[1]` + veor $Maj,$a,$b + vsli.64 $t2,$a,#`64-@Sigma0[2]` + veor $h,$t0,$t1 + vadd.i64 $T1,$K + vbsl $Maj,$c,$b @ Maj(a,b,c) + veor $h,$t2 @ Sigma0(a) + vadd.i64 $d,$T1 + vadd.i64 $Maj,$T1 + @ vadd.i64 $h,$Maj +___ +} + +sub NEON_16_79() { +my $i=shift; + +if ($i&1) { &NEON_00_15($i,@_); return; } + +# 2x-vectorized, therefore runs every 2nd round +my @X=map("q$_",(0..7)); # view @X as 128-bit vector +my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps +my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 +my $e=@_[4]; # $e from NEON_00_15 +$i /= 2; +$code.=<<___; + vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] + vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] + vadd.i64 @_[0],d30 @ h+=Maj from the past + vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] + vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` + vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] + vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` + veor $s1,$t0 + vshr.u64 $t0,$s0,#@sigma0[0] + veor $s1,$t1 @ sigma1(X[i+14]) + vshr.u64 $t1,$s0,#@sigma0[1] + vadd.i64 @X[$i%8],$s1 + vshr.u64 $s1,$s0,#@sigma0[2] + vsli.64 $t0,$s0,#`64-@sigma0[0]` + vsli.64 $t1,$s0,#`64-@sigma0[1]` + vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] + veor $s1,$t0 + vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s0 + vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 + veor $s1,$t1 @ sigma0(X[i+1]) + vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s1 +___ + &NEON_00_15(2*$i,@_); +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function +.align 4 +sha512_block_data_order_neon: +.LNEON: + dmb @ errata #451034 on early Cortex A8 + add $len,$inp,$len,lsl#7 @ len to point at 
the end of inp + VFP_ABI_PUSH + adr $Ktbl,.Lsha512_block_data_order + sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512 + vldmia $ctx,{$A-$H} @ load context +.Loop_neon: +___ +for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov $cnt,#4 +.L16_79_neon: + subs $cnt,#1 +___ +for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bne .L16_79_neon + + vadd.i64 $A,d30 @ h+=Maj from the past + vldmia $ctx,{d24-d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia $ctx,{$A-$H} @ save context + teq $inp,$len + sub $Ktbl,#640 @ rewind K512 + bne .Loop_neon + + VFP_ABI_POP + ret @ bx lr +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon +#endif +___ +} +$code.=<<___; +.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by " +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx lr/gm; + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +print $code; +close STDOUT; # enforce flush diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h new file mode 100644 index 000000000000..f147b6490d6c --- /dev/null +++ b/lib/crypto/arm/sha512.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * arm32-optimized SHA-512 block function + * + * Copyright 2025 Google LLC + */ + +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +asmlinkage void sha512_block_data_order(struct sha512_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha512_block_data_order_neon(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(crypto_simd_usable())) { + kernel_neon_begin(); + sha512_block_data_order_neon(state, data, nblocks); + kernel_neon_end(); + } else { + sha512_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_has_neon()) + static_branch_enable(&have_neon); +} +#endif /* CONFIG_KERNEL_MODE_NEON */ -- cgit v1.2.3 From 60e3f1e9b7a57567c2f3b3ae013e3e292cf6d115 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:12 -0700 Subject: lib/crypto: arm64/sha512: Migrate optimized SHA-512 code to library Instead of exposing the arm64-optimized SHA-512 code via arm64-specific crypto_shash algorithms, instead just implement the sha512_blocks() library function. This is much simpler, it makes the SHA-512 (and SHA-384) library functions be arm64-optimized, and it fixes the longstanding issue where the arm64-optimized SHA-512 code was disabled by default. SHA-512 still remains available through crypto_shash, but individual architectures no longer need to handle it. To match sha512_blocks(), change the type of the nblocks parameter of the assembly functions from int or 'unsigned int' to size_t. Update the ARMv8 CE assembly function accordingly. The scalar assembly function actually already treated it as size_t. 
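For reference, a caller-side sketch (illustrative only, not part of this patch): with the code moved into lib/crypto/, kernel users reach the arm64-optimized sha512_blocks() through the ordinary SHA-512 library helpers instead of allocating a "sha512" crypto_shash. The one-shot sha512() helper and its exact signature are assumed here to match the library API introduced earlier in this series.

#include <crypto/sha2.h>

/* Hypothetical caller, for illustration only. */
static void example_sha512_digest(const u8 *data, size_t len,
				  u8 digest[SHA512_DIGEST_SIZE])
{
	/* One-shot library call; internally ends up in sha512_blocks(). */
	sha512(data, len, digest);
}
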
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-9-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/configs/defconfig | 1 - arch/arm64/crypto/Kconfig | 19 ---- arch/arm64/crypto/Makefile | 14 --- arch/arm64/crypto/sha512-ce-core.S | 206 ------------------------------------- arch/arm64/crypto/sha512-ce-glue.c | 96 ----------------- arch/arm64/crypto/sha512-glue.c | 83 --------------- lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 10 ++ lib/crypto/arm64/.gitignore | 2 + lib/crypto/arm64/sha512-ce-core.S | 206 +++++++++++++++++++++++++++++++++++++ lib/crypto/arm64/sha512.h | 46 +++++++++ 11 files changed, 265 insertions(+), 419 deletions(-) delete mode 100644 arch/arm64/crypto/sha512-ce-core.S delete mode 100644 arch/arm64/crypto/sha512-ce-glue.c delete mode 100644 arch/arm64/crypto/sha512-glue.c create mode 100644 lib/crypto/arm64/.gitignore create mode 100644 lib/crypto/arm64/sha512-ce-core.S create mode 100644 lib/crypto/arm64/sha512.h diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 897fc686e6a9..b612b78b3b09 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1744,7 +1744,6 @@ CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_GHASH_ARM64_CE=y CONFIG_CRYPTO_SHA1_ARM64_CE=y -CONFIG_CRYPTO_SHA512_ARM64_CE=m CONFIG_CRYPTO_SHA3_ARM64=m CONFIG_CRYPTO_SM3_ARM64_CE=m CONFIG_CRYPTO_AES_ARM64_CE_BLK=y diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index c44b0f202a1f..a9ead99f72c2 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -36,25 +36,6 @@ config CRYPTO_SHA1_ARM64_CE Architecture: arm64 using: - ARMv8 Crypto Extensions -config CRYPTO_SHA512_ARM64 - tristate "Hash functions: SHA-384 and SHA-512" - select CRYPTO_HASH - help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180) - - Architecture: arm64 - -config CRYPTO_SHA512_ARM64_CE - tristate "Hash functions: SHA-384 and SHA-512 (ARMv8 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_HASH - select CRYPTO_SHA512_ARM64 - help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180) - - Architecture: arm64 using: - - ARMv8 Crypto Extensions - config CRYPTO_SHA3_ARM64 tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)" depends on KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index c231c980c514..228101f125d5 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -8,9 +8,6 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o -obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o -sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o - obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o @@ -53,9 +50,6 @@ aes-ce-blk-y := aes-glue-ce.o aes-ce.o obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o aes-neon-blk-y := aes-glue-neon.o aes-neon.o -obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o -sha512-arm64-y := sha512-glue.o sha512-core.o - obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o @@ -64,11 +58,3 @@ aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) void $(@) - -$(obj)/sha512-core.S: $(src)/../lib/crypto/sha2-armv8.pl - $(call cmd,perlasm) - -clean-files += sha512-core.S diff --git 
a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S deleted file mode 100644 index 91ef68b15fcc..000000000000 --- a/arch/arm64/crypto/sha512-ce-core.S +++ /dev/null @@ -1,206 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions - * - * Copyright (C) 2018 Linaro Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include - - .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 - .set .Lq\b, \b - .set .Lv\b\().2d, \b - .endr - - .macro sha512h, rd, rn, rm - .inst 0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16) - .endm - - .macro sha512h2, rd, rn, rm - .inst 0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16) - .endm - - .macro sha512su0, rd, rn - .inst 0xcec08000 | .L\rd | (.L\rn << 5) - .endm - - .macro sha512su1, rd, rn, rm - .inst 0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16) - .endm - - /* - * The SHA-512 round constants - */ - .section ".rodata", "a" - .align 4 -.Lsha512_rcon: - .quad 0x428a2f98d728ae22, 0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538, 0x59f111f1b605d019 - .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242, 0x12835b0145706fbe - .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235, 0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 - .quad 0x983e5152ee66dfab, 0xa831c66d2db43210 - .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725 - .quad 0x06ca6351e003826f, 0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df - .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6, 0x92722c851482353b - .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30 - .quad 0xd192e819d6ef5218, 0xd69906245565a910 - .quad 0xf40e35855771202a, 0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec - .quad 0x90befffa23631e28, 0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b - .quad 0xca273eceea26619c, 0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae, 0x1b710b35131c471b - .quad 0x28db77f523047d84, 0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 - - .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4 - .ifnb \rc1 - ld1 {v\rc1\().2d}, [x4], #16 - .endif - add v5.2d, v\rc0\().2d, v\in0\().2d - ext v6.16b, v\i2\().16b, v\i3\().16b, #8 - ext v5.16b, v5.16b, v5.16b, #8 - ext v7.16b, v\i1\().16b, v\i2\().16b, #8 - add v\i3\().2d, v\i3\().2d, v5.2d - .ifnb \in1 - ext v5.16b, v\in3\().16b, v\in4\().16b, #8 - sha512su0 v\in0\().2d, v\in1\().2d - .endif - sha512h q\i3, q6, v7.2d - .ifnb \in1 - sha512su1 v\in0\().2d, v\in2\().2d, v5.2d 
- .endif - add v\i4\().2d, v\i1\().2d, v\i3\().2d - sha512h2 q\i3, q\i1, v\i0\().2d - .endm - - /* - * int __sha512_ce_transform(struct sha512_state *sst, u8 const *src, - * int blocks) - */ - .text -SYM_FUNC_START(__sha512_ce_transform) - /* load state */ - ld1 {v8.2d-v11.2d}, [x0] - - /* load first 4 round constants */ - adr_l x3, .Lsha512_rcon - ld1 {v20.2d-v23.2d}, [x3], #64 - - /* load input */ -0: ld1 {v12.2d-v15.2d}, [x1], #64 - ld1 {v16.2d-v19.2d}, [x1], #64 - sub w2, w2, #1 - -CPU_LE( rev64 v12.16b, v12.16b ) -CPU_LE( rev64 v13.16b, v13.16b ) -CPU_LE( rev64 v14.16b, v14.16b ) -CPU_LE( rev64 v15.16b, v15.16b ) -CPU_LE( rev64 v16.16b, v16.16b ) -CPU_LE( rev64 v17.16b, v17.16b ) -CPU_LE( rev64 v18.16b, v18.16b ) -CPU_LE( rev64 v19.16b, v19.16b ) - - mov x4, x3 // rc pointer - - mov v0.16b, v8.16b - mov v1.16b, v9.16b - mov v2.16b, v10.16b - mov v3.16b, v11.16b - - // v0 ab cd -- ef gh ab - // v1 cd -- ef gh ab cd - // v2 ef gh ab cd -- ef - // v3 gh ab cd -- ef gh - // v4 -- ef gh ab cd -- - - dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17 - dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18 - dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19 - dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12 - dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13 - - dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14 - dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15 - dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16 - dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17 - dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18 - - dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19 - dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12 - dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13 - dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14 - dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15 - - dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16 - dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17 - dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18 - dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19 - dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12 - - dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13 - dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14 - dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15 - dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16 - dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17 - - dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18 - dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19 - dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12 - dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13 - dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14 - - dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15 - dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16 - dround 2, 3, 1, 4, 0, 28, 24, 12 - dround 4, 2, 0, 1, 3, 29, 25, 13 - dround 1, 4, 3, 0, 2, 30, 26, 14 - - dround 0, 1, 2, 3, 4, 31, 27, 15 - dround 3, 0, 4, 2, 1, 24, , 16 - dround 2, 3, 1, 4, 0, 25, , 17 - dround 4, 2, 0, 1, 3, 26, , 18 - dround 1, 4, 3, 0, 2, 27, , 19 - - /* update state */ - add v8.2d, v8.2d, v0.2d - add v9.2d, v9.2d, v1.2d - add v10.2d, v10.2d, v2.2d - add v11.2d, v11.2d, v3.2d - - cond_yield 3f, x4, x5 - /* handled all input blocks? 
*/ - cbnz w2, 0b - - /* store new state */ -3: st1 {v8.2d-v11.2d}, [x0] - mov w0, w2 - ret -SYM_FUNC_END(__sha512_ce_transform) diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c deleted file mode 100644 index 6fb3001fa2c9..000000000000 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions - * - * Copyright (C) 2018 Linaro Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include - -MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha384"); -MODULE_ALIAS_CRYPTO("sha512"); - -asmlinkage int __sha512_ce_transform(struct sha512_state *sst, u8 const *src, - int blocks); - -static void sha512_ce_transform(struct sha512_state *sst, u8 const *src, - int blocks) -{ - do { - int rem; - - kernel_neon_begin(); - rem = __sha512_ce_transform(sst, src, blocks); - kernel_neon_end(); - src += (blocks - rem) * SHA512_BLOCK_SIZE; - blocks = rem; - } while (blocks); -} - -static int sha512_ce_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_base_do_update_blocks(desc, data, len, - sha512_ce_transform); -} - -static int sha512_ce_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha512_base_do_finup(desc, data, len, sha512_ce_transform); - return sha512_base_finish(desc, out); -} - -static struct shash_alg algs[] = { { - .init = sha384_base_init, - .update = sha512_ce_update, - .finup = sha512_ce_finup, - .descsize = SHA512_STATE_SIZE, - .digestsize = SHA384_DIGEST_SIZE, - .base.cra_name = "sha384", - .base.cra_driver_name = "sha384-ce", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .base.cra_blocksize = SHA512_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .init = sha512_base_init, - .update = sha512_ce_update, - .finup = sha512_ce_finup, - .descsize = SHA512_STATE_SIZE, - .digestsize = SHA512_DIGEST_SIZE, - .base.cra_name = "sha512", - .base.cra_driver_name = "sha512-ce", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .base.cra_blocksize = SHA512_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -} }; - -static int __init sha512_ce_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha512_ce_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_cpu_feature_match(SHA512, sha512_ce_mod_init); -module_exit(sha512_ce_mod_fini); diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c deleted file mode 100644 index a78e184c100f..000000000000 --- a/arch/arm64/crypto/sha512-glue.c +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Linux/arm64 port of the OpenSSL SHA512 implementation for AArch64 - * - * Copyright (c) 2016 Linaro Ltd. 
- */ - -#include -#include -#include -#include -#include - -MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash for arm64"); -MODULE_AUTHOR("Andy Polyakov "); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha384"); -MODULE_ALIAS_CRYPTO("sha512"); - -asmlinkage void sha512_blocks_arch(u64 *digest, const void *data, - unsigned int num_blks); - -static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src, - int blocks) -{ - sha512_blocks_arch(sst->state, src, blocks); -} - -static int sha512_update_arm64(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_base_do_update_blocks(desc, data, len, - sha512_arm64_transform); -} - -static int sha512_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha512_base_do_finup(desc, data, len, sha512_arm64_transform); - return sha512_base_finish(desc, out); -} - -static struct shash_alg algs[] = { { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_base_init, - .update = sha512_update_arm64, - .finup = sha512_finup, - .descsize = SHA512_STATE_SIZE, - .base.cra_name = "sha512", - .base.cra_driver_name = "sha512-arm64", - .base.cra_priority = 150, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .base.cra_blocksize = SHA512_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_base_init, - .update = sha512_update_arm64, - .finup = sha512_finup, - .descsize = SHA512_STATE_SIZE, - .base.cra_name = "sha384", - .base.cra_driver_name = "sha384-arm64", - .base.cra_priority = 150, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .base.cra_blocksize = SHA384_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -} }; - -static int __init sha512_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha512_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_init(sha512_mod_init); -module_exit(sha512_mod_fini); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index dac6356ba0aa..26413f679fab 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -178,6 +178,7 @@ config CRYPTO_LIB_SHA512_ARCH bool depends on CRYPTO_LIB_SHA512 && !UML default y if ARM && !CPU_V7M + default y if ARM64 config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 67008a1612c6..22269ab06d70 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -5,6 +5,9 @@ aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $(<) > $(@) +quiet_cmd_perlasm_with_args = PERLASM $@ + cmd_perlasm_with_args = $(PERL) $(<) void $(@) + obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o libcryptoutils-y := memneq.o utils.o @@ -82,6 +85,13 @@ clean-files += arm/sha512-core.S AFLAGS_arm/sha512-core.o += $(aflags-thumb2-y) endif +ifeq ($(CONFIG_ARM64),y) +libsha512-y += arm64/sha512-core.o +$(obj)/arm64/sha512-core.S: $(src)/../../arch/arm64/lib/crypto/sha2-armv8.pl + $(call cmd,perlasm_with_args) +clean-files += arm64/sha512-core.S +libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o +endif endif # CONFIG_CRYPTO_LIB_SHA512_ARCH obj-$(CONFIG_MPILIB) += mpi/ diff --git a/lib/crypto/arm64/.gitignore b/lib/crypto/arm64/.gitignore new file mode 100644 index 000000000000..670a4d97b568 --- /dev/null +++ b/lib/crypto/arm64/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +sha512-core.S 
diff --git a/lib/crypto/arm64/sha512-ce-core.S b/lib/crypto/arm64/sha512-ce-core.S new file mode 100644 index 000000000000..7d870a435ea3 --- /dev/null +++ b/lib/crypto/arm64/sha512-ce-core.S @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + + .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 + .set .Lq\b, \b + .set .Lv\b\().2d, \b + .endr + + .macro sha512h, rd, rn, rm + .inst 0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro sha512h2, rd, rn, rm + .inst 0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro sha512su0, rd, rn + .inst 0xcec08000 | .L\rd | (.L\rn << 5) + .endm + + .macro sha512su1, rd, rn, rm + .inst 0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + /* + * The SHA-512 round constants + */ + .section ".rodata", "a" + .align 4 +.Lsha512_rcon: + .quad 0x428a2f98d728ae22, 0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538, 0x59f111f1b605d019 + .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242, 0x12835b0145706fbe + .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235, 0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .quad 0x983e5152ee66dfab, 0xa831c66d2db43210 + .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .quad 0x06ca6351e003826f, 0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6, 0x92722c851482353b + .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .quad 0xd192e819d6ef5218, 0xd69906245565a910 + .quad 0xf40e35855771202a, 0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .quad 0x90befffa23631e28, 0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .quad 0xca273eceea26619c, 0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae, 0x1b710b35131c471b + .quad 0x28db77f523047d84, 0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 + + .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4 + .ifnb \rc1 + ld1 {v\rc1\().2d}, [x4], #16 + .endif + add v5.2d, v\rc0\().2d, v\in0\().2d + ext v6.16b, v\i2\().16b, v\i3\().16b, #8 + ext v5.16b, v5.16b, v5.16b, #8 + ext v7.16b, v\i1\().16b, v\i2\().16b, #8 + add v\i3\().2d, v\i3\().2d, v5.2d + .ifnb \in1 + ext v5.16b, v\in3\().16b, v\in4\().16b, #8 + sha512su0 v\in0\().2d, v\in1\().2d + .endif + sha512h q\i3, q6, v7.2d + .ifnb \in1 + sha512su1 v\in0\().2d, v\in2\().2d, 
v5.2d + .endif + add v\i4\().2d, v\i1\().2d, v\i3\().2d + sha512h2 q\i3, q\i1, v\i0\().2d + .endm + + /* + * size_t __sha512_ce_transform(struct sha512_block_state *state, + * const u8 *data, size_t nblocks); + */ + .text +SYM_FUNC_START(__sha512_ce_transform) + /* load state */ + ld1 {v8.2d-v11.2d}, [x0] + + /* load first 4 round constants */ + adr_l x3, .Lsha512_rcon + ld1 {v20.2d-v23.2d}, [x3], #64 + + /* load input */ +0: ld1 {v12.2d-v15.2d}, [x1], #64 + ld1 {v16.2d-v19.2d}, [x1], #64 + sub x2, x2, #1 + +CPU_LE( rev64 v12.16b, v12.16b ) +CPU_LE( rev64 v13.16b, v13.16b ) +CPU_LE( rev64 v14.16b, v14.16b ) +CPU_LE( rev64 v15.16b, v15.16b ) +CPU_LE( rev64 v16.16b, v16.16b ) +CPU_LE( rev64 v17.16b, v17.16b ) +CPU_LE( rev64 v18.16b, v18.16b ) +CPU_LE( rev64 v19.16b, v19.16b ) + + mov x4, x3 // rc pointer + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + mov v2.16b, v10.16b + mov v3.16b, v11.16b + + // v0 ab cd -- ef gh ab + // v1 cd -- ef gh ab cd + // v2 ef gh ab cd -- ef + // v3 gh ab cd -- ef gh + // v4 -- ef gh ab cd -- + + dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17 + dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18 + dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19 + dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12 + dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13 + + dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14 + dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15 + dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16 + dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17 + dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18 + + dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19 + dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12 + dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13 + dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14 + dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15 + + dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16 + dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17 + dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18 + dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19 + dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12 + + dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13 + dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14 + dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15 + dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16 + dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17 + + dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18 + dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19 + dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12 + dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13 + dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14 + + dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15 + dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16 + dround 2, 3, 1, 4, 0, 28, 24, 12 + dround 4, 2, 0, 1, 3, 29, 25, 13 + dround 1, 4, 3, 0, 2, 30, 26, 14 + + dround 0, 1, 2, 3, 4, 31, 27, 15 + dround 3, 0, 4, 2, 1, 24, , 16 + dround 2, 3, 1, 4, 0, 25, , 17 + dround 4, 2, 0, 1, 3, 26, , 18 + dround 1, 4, 3, 0, 2, 27, , 19 + + /* update state */ + add v8.2d, v8.2d, v0.2d + add v9.2d, v9.2d, v1.2d + add v10.2d, v10.2d, v2.2d + add v11.2d, v11.2d, v3.2d + + cond_yield 3f, x4, x5 + /* handled all input blocks? 
*/ + cbnz x2, 0b + + /* store new state */ +3: st1 {v8.2d-v11.2d}, [x0] + mov x0, x2 + ret +SYM_FUNC_END(__sha512_ce_transform) diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h new file mode 100644 index 000000000000..eae14f9752e0 --- /dev/null +++ b/lib/crypto/arm64/sha512.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * arm64-optimized SHA-512 block function + * + * Copyright 2025 Google LLC + */ + +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_insns); + +asmlinkage void sha512_blocks_arch(struct sha512_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage size_t __sha512_ce_transform(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_sha512_insns) && + likely(crypto_simd_usable())) { + do { + size_t rem; + + kernel_neon_begin(); + rem = __sha512_ce_transform(state, data, nblocks); + kernel_neon_end(); + data += (nblocks - rem) * SHA512_BLOCK_SIZE; + nblocks = rem; + } while (nblocks); + } else { + sha512_blocks_arch(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_have_named_feature(SHA512)) + static_branch_enable(&have_sha512_insns); +} +#endif /* CONFIG_KERNEL_MODE_NEON */ -- cgit v1.2.3 From ecac3068ffc28324070e96c46d171b5431215c0d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:13 -0700 Subject: mips: cavium-octeon: Move octeon-crypto.h into asm directory Since arch/mips/cavium-octeon/crypto/octeon-crypto.h is now needed outside of its directory, move it to arch/mips/include/asm/octeon/crypto.h so that it can be included as . Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-10-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/mips/cavium-octeon/crypto/octeon-crypto.c | 3 +- arch/mips/cavium-octeon/crypto/octeon-crypto.h | 224 ------------------------- arch/mips/cavium-octeon/crypto/octeon-md5.c | 3 +- arch/mips/cavium-octeon/crypto/octeon-sha1.c | 3 +- arch/mips/cavium-octeon/crypto/octeon-sha256.c | 3 +- arch/mips/cavium-octeon/crypto/octeon-sha512.c | 3 +- arch/mips/include/asm/octeon/crypto.h | 224 +++++++++++++++++++++++++ 7 files changed, 229 insertions(+), 234 deletions(-) delete mode 100644 arch/mips/cavium-octeon/crypto/octeon-crypto.h create mode 100644 arch/mips/include/asm/octeon/crypto.h diff --git a/arch/mips/cavium-octeon/crypto/octeon-crypto.c b/arch/mips/cavium-octeon/crypto/octeon-crypto.c index cfb4a146cf17..0ff8559391f5 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-crypto.c +++ b/arch/mips/cavium-octeon/crypto/octeon-crypto.c @@ -7,12 +7,11 @@ */ #include +#include #include #include #include -#include "octeon-crypto.h" - /** * Enable access to Octeon's COP2 crypto hardware for kernel use. Wrap any * crypto operations in calls to octeon_crypto_enable/disable in order to make diff --git a/arch/mips/cavium-octeon/crypto/octeon-crypto.h b/arch/mips/cavium-octeon/crypto/octeon-crypto.h deleted file mode 100644 index cb68f9e284bb..000000000000 --- a/arch/mips/cavium-octeon/crypto/octeon-crypto.h +++ /dev/null @@ -1,224 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. 
See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2012-2013 Cavium Inc., All Rights Reserved. - * - * MD5/SHA1/SHA256/SHA512 instruction definitions added by - * Aaro Koskinen . - * - */ -#ifndef __LINUX_OCTEON_CRYPTO_H -#define __LINUX_OCTEON_CRYPTO_H - -#include -#include - -#define OCTEON_CR_OPCODE_PRIORITY 300 - -extern unsigned long octeon_crypto_enable(struct octeon_cop2_state *state); -extern void octeon_crypto_disable(struct octeon_cop2_state *state, - unsigned long flags); - -/* - * Macros needed to implement MD5/SHA1/SHA256: - */ - -/* - * The index can be 0-1 (MD5) or 0-2 (SHA1), 0-3 (SHA256). - */ -#define write_octeon_64bit_hash_dword(value, index) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x0048+" STR(index) \ - : \ - : [rt] "d" (cpu_to_be64(value))); \ -} while (0) - -/* - * The index can be 0-1 (MD5) or 0-2 (SHA1), 0-3 (SHA256). - */ -#define read_octeon_64bit_hash_dword(index) \ -({ \ - __be64 __value; \ - \ - __asm__ __volatile__ ( \ - "dmfc2 %[rt],0x0048+" STR(index) \ - : [rt] "=d" (__value) \ - : ); \ - \ - be64_to_cpu(__value); \ -}) - -/* - * The index can be 0-6. - */ -#define write_octeon_64bit_block_dword(value, index) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x0040+" STR(index) \ - : \ - : [rt] "d" (cpu_to_be64(value))); \ -} while (0) - -/* - * The value is the final block dword (64-bit). - */ -#define octeon_md5_start(value) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x4047" \ - : \ - : [rt] "d" (cpu_to_be64(value))); \ -} while (0) - -/* - * The value is the final block dword (64-bit). - */ -#define octeon_sha1_start(value) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x4057" \ - : \ - : [rt] "d" (value)); \ -} while (0) - -/* - * The value is the final block dword (64-bit). - */ -#define octeon_sha256_start(value) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x404f" \ - : \ - : [rt] "d" (value)); \ -} while (0) - -/* - * Macros needed to implement SHA512: - */ - -/* - * The index can be 0-7. - */ -#define write_octeon_64bit_hash_sha512(value, index) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x0250+" STR(index) \ - : \ - : [rt] "d" (value)); \ -} while (0) - -/* - * The index can be 0-7. - */ -#define read_octeon_64bit_hash_sha512(index) \ -({ \ - u64 __value; \ - \ - __asm__ __volatile__ ( \ - "dmfc2 %[rt],0x0250+" STR(index) \ - : [rt] "=d" (__value) \ - : ); \ - \ - __value; \ -}) - -/* - * The index can be 0-14. - */ -#define write_octeon_64bit_block_sha512(value, index) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x0240+" STR(index) \ - : \ - : [rt] "d" (value)); \ -} while (0) - -/* - * The value is the final block word (64-bit). - */ -#define octeon_sha512_start(value) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x424f" \ - : \ - : [rt] "d" (value)); \ -} while (0) - -/* - * The value is the final block dword (64-bit). - */ -#define octeon_sha1_start(value) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x4057" \ - : \ - : [rt] "d" (value)); \ -} while (0) - -/* - * The value is the final block dword (64-bit). - */ -#define octeon_sha256_start(value) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x404f" \ - : \ - : [rt] "d" (value)); \ -} while (0) - -/* - * Macros needed to implement SHA512: - */ - -/* - * The index can be 0-7. 
- */ -#define write_octeon_64bit_hash_sha512(value, index) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x0250+" STR(index) \ - : \ - : [rt] "d" (value)); \ -} while (0) - -/* - * The index can be 0-7. - */ -#define read_octeon_64bit_hash_sha512(index) \ -({ \ - u64 __value; \ - \ - __asm__ __volatile__ ( \ - "dmfc2 %[rt],0x0250+" STR(index) \ - : [rt] "=d" (__value) \ - : ); \ - \ - __value; \ -}) - -/* - * The index can be 0-14. - */ -#define write_octeon_64bit_block_sha512(value, index) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x0240+" STR(index) \ - : \ - : [rt] "d" (value)); \ -} while (0) - -/* - * The value is the final block word (64-bit). - */ -#define octeon_sha512_start(value) \ -do { \ - __asm__ __volatile__ ( \ - "dmtc2 %[rt],0x424f" \ - : \ - : [rt] "d" (value)); \ -} while (0) - -#endif /* __LINUX_OCTEON_CRYPTO_H */ diff --git a/arch/mips/cavium-octeon/crypto/octeon-md5.c b/arch/mips/cavium-octeon/crypto/octeon-md5.c index fbc84eb7fedf..a8ce831e2ceb 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-md5.c +++ b/arch/mips/cavium-octeon/crypto/octeon-md5.c @@ -19,6 +19,7 @@ * any later version. */ +#include #include #include #include @@ -27,8 +28,6 @@ #include #include -#include "octeon-crypto.h" - struct octeon_md5_state { __le32 hash[MD5_HASH_WORDS]; u64 byte_count; diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha1.c b/arch/mips/cavium-octeon/crypto/octeon-sha1.c index e70f21a473da..e4a369a7764f 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-sha1.c +++ b/arch/mips/cavium-octeon/crypto/octeon-sha1.c @@ -13,6 +13,7 @@ * Copyright (c) Jean-Francois Dive */ +#include #include #include #include @@ -21,8 +22,6 @@ #include #include -#include "octeon-crypto.h" - /* * We pass everything as 64-bit. OCTEON can handle misaligned data. */ diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha256.c b/arch/mips/cavium-octeon/crypto/octeon-sha256.c index f93faaf1f4af..c20038239cb6 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-sha256.c +++ b/arch/mips/cavium-octeon/crypto/octeon-sha256.c @@ -12,13 +12,12 @@ * SHA224 Support Copyright 2007 Intel Corporation */ +#include #include #include #include #include -#include "octeon-crypto.h" - /* * We pass everything as 64-bit. OCTEON can handle misaligned data. */ diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha512.c b/arch/mips/cavium-octeon/crypto/octeon-sha512.c index 215311053db3..53de74f642db 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-sha512.c +++ b/arch/mips/cavium-octeon/crypto/octeon-sha512.c @@ -13,6 +13,7 @@ * Copyright (c) 2003 Kyle McMartin */ +#include #include #include #include @@ -20,8 +21,6 @@ #include #include -#include "octeon-crypto.h" - /* * We pass everything as 64-bit. OCTEON can handle misaligned data. */ diff --git a/arch/mips/include/asm/octeon/crypto.h b/arch/mips/include/asm/octeon/crypto.h new file mode 100644 index 000000000000..cb68f9e284bb --- /dev/null +++ b/arch/mips/include/asm/octeon/crypto.h @@ -0,0 +1,224 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2012-2013 Cavium Inc., All Rights Reserved. + * + * MD5/SHA1/SHA256/SHA512 instruction definitions added by + * Aaro Koskinen . 
+ * + */ +#ifndef __LINUX_OCTEON_CRYPTO_H +#define __LINUX_OCTEON_CRYPTO_H + +#include +#include + +#define OCTEON_CR_OPCODE_PRIORITY 300 + +extern unsigned long octeon_crypto_enable(struct octeon_cop2_state *state); +extern void octeon_crypto_disable(struct octeon_cop2_state *state, + unsigned long flags); + +/* + * Macros needed to implement MD5/SHA1/SHA256: + */ + +/* + * The index can be 0-1 (MD5) or 0-2 (SHA1), 0-3 (SHA256). + */ +#define write_octeon_64bit_hash_dword(value, index) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x0048+" STR(index) \ + : \ + : [rt] "d" (cpu_to_be64(value))); \ +} while (0) + +/* + * The index can be 0-1 (MD5) or 0-2 (SHA1), 0-3 (SHA256). + */ +#define read_octeon_64bit_hash_dword(index) \ +({ \ + __be64 __value; \ + \ + __asm__ __volatile__ ( \ + "dmfc2 %[rt],0x0048+" STR(index) \ + : [rt] "=d" (__value) \ + : ); \ + \ + be64_to_cpu(__value); \ +}) + +/* + * The index can be 0-6. + */ +#define write_octeon_64bit_block_dword(value, index) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x0040+" STR(index) \ + : \ + : [rt] "d" (cpu_to_be64(value))); \ +} while (0) + +/* + * The value is the final block dword (64-bit). + */ +#define octeon_md5_start(value) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x4047" \ + : \ + : [rt] "d" (cpu_to_be64(value))); \ +} while (0) + +/* + * The value is the final block dword (64-bit). + */ +#define octeon_sha1_start(value) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x4057" \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * The value is the final block dword (64-bit). + */ +#define octeon_sha256_start(value) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x404f" \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * Macros needed to implement SHA512: + */ + +/* + * The index can be 0-7. + */ +#define write_octeon_64bit_hash_sha512(value, index) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x0250+" STR(index) \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * The index can be 0-7. + */ +#define read_octeon_64bit_hash_sha512(index) \ +({ \ + u64 __value; \ + \ + __asm__ __volatile__ ( \ + "dmfc2 %[rt],0x0250+" STR(index) \ + : [rt] "=d" (__value) \ + : ); \ + \ + __value; \ +}) + +/* + * The index can be 0-14. + */ +#define write_octeon_64bit_block_sha512(value, index) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x0240+" STR(index) \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * The value is the final block word (64-bit). + */ +#define octeon_sha512_start(value) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x424f" \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * The value is the final block dword (64-bit). + */ +#define octeon_sha1_start(value) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x4057" \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * The value is the final block dword (64-bit). + */ +#define octeon_sha256_start(value) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x404f" \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * Macros needed to implement SHA512: + */ + +/* + * The index can be 0-7. + */ +#define write_octeon_64bit_hash_sha512(value, index) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x0250+" STR(index) \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * The index can be 0-7. 
+ */ +#define read_octeon_64bit_hash_sha512(index) \ +({ \ + u64 __value; \ + \ + __asm__ __volatile__ ( \ + "dmfc2 %[rt],0x0250+" STR(index) \ + : [rt] "=d" (__value) \ + : ); \ + \ + __value; \ +}) + +/* + * The index can be 0-14. + */ +#define write_octeon_64bit_block_sha512(value, index) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x0240+" STR(index) \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * The value is the final block word (64-bit). + */ +#define octeon_sha512_start(value) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x424f" \ + : \ + : [rt] "d" (value)); \ +} while (0) + +#endif /* __LINUX_OCTEON_CRYPTO_H */ -- cgit v1.2.3 From 7117739ad2b40298776f428382ebce525adc7f7d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:14 -0700 Subject: lib/crypto: mips/sha512: Migrate optimized SHA-512 code to library Instead of exposing the mips-optimized SHA-512 code via mips-specific crypto_shash algorithms, instead just implement the sha512_blocks() library function. This is much simpler, it makes the SHA-512 (and SHA-384) library functions be mips-optimized, and it fixes the longstanding issue where the mips-optimized SHA-512 code was disabled by default. SHA-512 still remains available through crypto_shash, but individual architectures no longer need to handle it. Note: to see the diff from arch/mips/cavium-octeon/crypto/octeon-sha512.c to lib/crypto/mips/sha512.h, view this commit with 'git show -M10'. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-11-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/mips/cavium-octeon/crypto/Makefile | 1 - arch/mips/cavium-octeon/crypto/octeon-sha512.c | 166 ------------------------- arch/mips/configs/cavium_octeon_defconfig | 1 - arch/mips/crypto/Kconfig | 10 -- lib/crypto/Kconfig | 1 + lib/crypto/mips/sha512.h | 74 +++++++++++ 6 files changed, 75 insertions(+), 178 deletions(-) delete mode 100644 arch/mips/cavium-octeon/crypto/octeon-sha512.c create mode 100644 lib/crypto/mips/sha512.h diff --git a/arch/mips/cavium-octeon/crypto/Makefile b/arch/mips/cavium-octeon/crypto/Makefile index db26c73fa0ed..168b19ef7ce8 100644 --- a/arch/mips/cavium-octeon/crypto/Makefile +++ b/arch/mips/cavium-octeon/crypto/Makefile @@ -8,4 +8,3 @@ obj-y += octeon-crypto.o obj-$(CONFIG_CRYPTO_MD5_OCTEON) += octeon-md5.o obj-$(CONFIG_CRYPTO_SHA1_OCTEON) += octeon-sha1.o obj-$(CONFIG_CRYPTO_SHA256_OCTEON) += octeon-sha256.o -obj-$(CONFIG_CRYPTO_SHA512_OCTEON) += octeon-sha512.o diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha512.c b/arch/mips/cavium-octeon/crypto/octeon-sha512.c deleted file mode 100644 index 53de74f642db..000000000000 --- a/arch/mips/cavium-octeon/crypto/octeon-sha512.c +++ /dev/null @@ -1,166 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * - * SHA-512 and SHA-384 Secure Hash Algorithm. - * - * Adapted for OCTEON by Aaro Koskinen . - * - * Based on crypto/sha512_generic.c, which is: - * - * Copyright (c) Jean-Luc Cooke - * Copyright (c) Andrew McDonald - * Copyright (c) 2003 Kyle McMartin - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * We pass everything as 64-bit. OCTEON can handle misaligned data. 
- */ - -static void octeon_sha512_store_hash(struct sha512_state *sctx) -{ - write_octeon_64bit_hash_sha512(sctx->state[0], 0); - write_octeon_64bit_hash_sha512(sctx->state[1], 1); - write_octeon_64bit_hash_sha512(sctx->state[2], 2); - write_octeon_64bit_hash_sha512(sctx->state[3], 3); - write_octeon_64bit_hash_sha512(sctx->state[4], 4); - write_octeon_64bit_hash_sha512(sctx->state[5], 5); - write_octeon_64bit_hash_sha512(sctx->state[6], 6); - write_octeon_64bit_hash_sha512(sctx->state[7], 7); -} - -static void octeon_sha512_read_hash(struct sha512_state *sctx) -{ - sctx->state[0] = read_octeon_64bit_hash_sha512(0); - sctx->state[1] = read_octeon_64bit_hash_sha512(1); - sctx->state[2] = read_octeon_64bit_hash_sha512(2); - sctx->state[3] = read_octeon_64bit_hash_sha512(3); - sctx->state[4] = read_octeon_64bit_hash_sha512(4); - sctx->state[5] = read_octeon_64bit_hash_sha512(5); - sctx->state[6] = read_octeon_64bit_hash_sha512(6); - sctx->state[7] = read_octeon_64bit_hash_sha512(7); -} - -static void octeon_sha512_transform(struct sha512_state *sctx, - const u8 *src, int blocks) -{ - do { - const u64 *block = (const u64 *)src; - - write_octeon_64bit_block_sha512(block[0], 0); - write_octeon_64bit_block_sha512(block[1], 1); - write_octeon_64bit_block_sha512(block[2], 2); - write_octeon_64bit_block_sha512(block[3], 3); - write_octeon_64bit_block_sha512(block[4], 4); - write_octeon_64bit_block_sha512(block[5], 5); - write_octeon_64bit_block_sha512(block[6], 6); - write_octeon_64bit_block_sha512(block[7], 7); - write_octeon_64bit_block_sha512(block[8], 8); - write_octeon_64bit_block_sha512(block[9], 9); - write_octeon_64bit_block_sha512(block[10], 10); - write_octeon_64bit_block_sha512(block[11], 11); - write_octeon_64bit_block_sha512(block[12], 12); - write_octeon_64bit_block_sha512(block[13], 13); - write_octeon_64bit_block_sha512(block[14], 14); - octeon_sha512_start(block[15]); - - src += SHA512_BLOCK_SIZE; - } while (--blocks); -} - -static int octeon_sha512_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - struct octeon_cop2_state state; - unsigned long flags; - int remain; - - flags = octeon_crypto_enable(&state); - octeon_sha512_store_hash(sctx); - - remain = sha512_base_do_update_blocks(desc, data, len, - octeon_sha512_transform); - - octeon_sha512_read_hash(sctx); - octeon_crypto_disable(&state, flags); - return remain; -} - -static int octeon_sha512_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *hash) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - struct octeon_cop2_state state; - unsigned long flags; - - flags = octeon_crypto_enable(&state); - octeon_sha512_store_hash(sctx); - - sha512_base_do_finup(desc, src, len, octeon_sha512_transform); - - octeon_sha512_read_hash(sctx); - octeon_crypto_disable(&state, flags); - return sha512_base_finish(desc, hash); -} - -static struct shash_alg octeon_sha512_algs[2] = { { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_base_init, - .update = octeon_sha512_update, - .finup = octeon_sha512_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name= "octeon-sha512", - .cra_priority = OCTEON_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_base_init, - .update = octeon_sha512_update, - .finup = octeon_sha512_finup, - .descsize = 
SHA512_STATE_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name= "octeon-sha384", - .cra_priority = OCTEON_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int __init octeon_sha512_mod_init(void) -{ - if (!octeon_has_crypto()) - return -ENOTSUPP; - return crypto_register_shashes(octeon_sha512_algs, - ARRAY_SIZE(octeon_sha512_algs)); -} - -static void __exit octeon_sha512_mod_fini(void) -{ - crypto_unregister_shashes(octeon_sha512_algs, - ARRAY_SIZE(octeon_sha512_algs)); -} - -module_init(octeon_sha512_mod_init); -module_exit(octeon_sha512_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-512 and SHA-384 Secure Hash Algorithms (OCTEON)"); -MODULE_AUTHOR("Aaro Koskinen "); diff --git a/arch/mips/configs/cavium_octeon_defconfig b/arch/mips/configs/cavium_octeon_defconfig index 88ae0aa85364..effdfb2bb738 100644 --- a/arch/mips/configs/cavium_octeon_defconfig +++ b/arch/mips/configs/cavium_octeon_defconfig @@ -157,7 +157,6 @@ CONFIG_CRYPTO_CBC=y CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_MD5_OCTEON=y CONFIG_CRYPTO_SHA1_OCTEON=m -CONFIG_CRYPTO_SHA512_OCTEON=m CONFIG_CRYPTO_DES=y CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/crypto/Kconfig b/arch/mips/crypto/Kconfig index 6bf073ae7613..51a76a5ee3b1 100644 --- a/arch/mips/crypto/Kconfig +++ b/arch/mips/crypto/Kconfig @@ -22,14 +22,4 @@ config CRYPTO_SHA1_OCTEON Architecture: mips OCTEON -config CRYPTO_SHA512_OCTEON - tristate "Hash functions: SHA-384 and SHA-512 (OCTEON)" - depends on CPU_CAVIUM_OCTEON - select CRYPTO_SHA512 - select CRYPTO_HASH - help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180) - - Architecture: mips OCTEON using crypto instructions, when available - endmenu diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 26413f679fab..303ea15e3e90 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -179,6 +179,7 @@ config CRYPTO_LIB_SHA512_ARCH depends on CRYPTO_LIB_SHA512 && !UML default y if ARM && !CPU_V7M default y if ARM64 + default y if MIPS && CPU_CAVIUM_OCTEON config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/mips/sha512.h b/lib/crypto/mips/sha512.h new file mode 100644 index 000000000000..b3ffbc1e8ca8 --- /dev/null +++ b/lib/crypto/mips/sha512.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Cryptographic API. + * + * SHA-512 and SHA-384 Secure Hash Algorithm. + * + * Adapted for OCTEON by Aaro Koskinen . + * + * Based on crypto/sha512_generic.c, which is: + * + * Copyright (c) Jean-Luc Cooke + * Copyright (c) Andrew McDonald + * Copyright (c) 2003 Kyle McMartin + */ + +#include +#include + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. 
+ */ + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + struct octeon_cop2_state cop2_state; + unsigned long flags; + + if (!octeon_has_crypto()) + return sha512_blocks_generic(state, data, nblocks); + + flags = octeon_crypto_enable(&cop2_state); + write_octeon_64bit_hash_sha512(state->h[0], 0); + write_octeon_64bit_hash_sha512(state->h[1], 1); + write_octeon_64bit_hash_sha512(state->h[2], 2); + write_octeon_64bit_hash_sha512(state->h[3], 3); + write_octeon_64bit_hash_sha512(state->h[4], 4); + write_octeon_64bit_hash_sha512(state->h[5], 5); + write_octeon_64bit_hash_sha512(state->h[6], 6); + write_octeon_64bit_hash_sha512(state->h[7], 7); + + do { + const u64 *block = (const u64 *)data; + + write_octeon_64bit_block_sha512(block[0], 0); + write_octeon_64bit_block_sha512(block[1], 1); + write_octeon_64bit_block_sha512(block[2], 2); + write_octeon_64bit_block_sha512(block[3], 3); + write_octeon_64bit_block_sha512(block[4], 4); + write_octeon_64bit_block_sha512(block[5], 5); + write_octeon_64bit_block_sha512(block[6], 6); + write_octeon_64bit_block_sha512(block[7], 7); + write_octeon_64bit_block_sha512(block[8], 8); + write_octeon_64bit_block_sha512(block[9], 9); + write_octeon_64bit_block_sha512(block[10], 10); + write_octeon_64bit_block_sha512(block[11], 11); + write_octeon_64bit_block_sha512(block[12], 12); + write_octeon_64bit_block_sha512(block[13], 13); + write_octeon_64bit_block_sha512(block[14], 14); + octeon_sha512_start(block[15]); + + data += SHA512_BLOCK_SIZE; + } while (--nblocks); + + state->h[0] = read_octeon_64bit_hash_sha512(0); + state->h[1] = read_octeon_64bit_hash_sha512(1); + state->h[2] = read_octeon_64bit_hash_sha512(2); + state->h[3] = read_octeon_64bit_hash_sha512(3); + state->h[4] = read_octeon_64bit_hash_sha512(4); + state->h[5] = read_octeon_64bit_hash_sha512(5); + state->h[6] = read_octeon_64bit_hash_sha512(6); + state->h[7] = read_octeon_64bit_hash_sha512(7); + octeon_crypto_disable(&cop2_state, flags); +} -- cgit v1.2.3 From b59059a22c5a717284b05385de5c0cbff2879859 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:15 -0700 Subject: lib/crypto: riscv/sha512: Migrate optimized SHA-512 code to library Instead of exposing the riscv-optimized SHA-512 code via riscv-specific crypto_shash algorithms, instead just implement the sha512_blocks() library function. This is much simpler, it makes the SHA-512 (and SHA-384) library functions be riscv-optimized, and it fixes the longstanding issue where the riscv-optimized SHA-512 code was disabled by default. SHA-512 still remains available through crypto_shash, but individual architectures no longer need to handle it. To match sha512_blocks(), change the type of the nblocks parameter of the assembly function from int to size_t. The assembly function actually already treated it as size_t. Note: to see the diff from arch/riscv/crypto/sha512-riscv64-glue.c to lib/crypto/riscv/sha512.h, view this commit with 'git show -M10'. 
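For illustration, a minimal caller-side sketch (assuming the one-shot sha512()/sha384() helpers declared in <crypto/sha2.h>): callers keep using the library interface unchanged, and the Zvknhb/Zvkb code is selected internally by sha512_blocks() when it is usable.

#include <crypto/sha2.h>        /* sha512(), sha384(), digest size macros */
#include <linux/types.h>

static void demo_library_digest(const u8 *msg, size_t len)
{
        u8 d512[SHA512_DIGEST_SIZE];
        u8 d384[SHA384_DIGEST_SIZE];

        /*
         * Both helpers funnel into sha512_blocks(); whether the riscv
         * override or the generic C fallback runs is decided inside the
         * library, so no arch-specific API is needed here.
         */
        sha512(msg, len, d512);
        sha384(msg, len, d384);
}
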
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-12-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/riscv/crypto/Kconfig | 12 -- arch/riscv/crypto/Makefile | 3 - arch/riscv/crypto/sha512-riscv64-glue.c | 130 ---------------- arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S | 203 ------------------------- lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 2 + lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S | 203 +++++++++++++++++++++++++ lib/crypto/riscv/sha512.h | 41 +++++ 8 files changed, 247 insertions(+), 348 deletions(-) delete mode 100644 arch/riscv/crypto/sha512-riscv64-glue.c delete mode 100644 arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S create mode 100644 lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S create mode 100644 lib/crypto/riscv/sha512.h diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index 53e4e1eacf55..a75d6325607b 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -28,18 +28,6 @@ config CRYPTO_GHASH_RISCV64 Architecture: riscv64 using: - Zvkg vector crypto extension -config CRYPTO_SHA512_RISCV64 - tristate "Hash functions: SHA-384 and SHA-512" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO - select CRYPTO_LIB_SHA512 - select CRYPTO_SHA512 - help - SHA-384 and SHA-512 secure hash algorithm (FIPS 180) - - Architecture: riscv64 using: - - Zvknhb vector crypto extension - - Zvkb vector crypto extension - config CRYPTO_SM3_RISCV64 tristate "Hash functions: SM3 (ShangMi 3)" depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile index e10e8257734e..183495a95cc0 100644 --- a/arch/riscv/crypto/Makefile +++ b/arch/riscv/crypto/Makefile @@ -7,9 +7,6 @@ aes-riscv64-y := aes-riscv64-glue.o aes-riscv64-zvkned.o \ obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o -obj-$(CONFIG_CRYPTO_SHA512_RISCV64) += sha512-riscv64.o -sha512-riscv64-y := sha512-riscv64-glue.o sha512-riscv64-zvknhb-zvkb.o - obj-$(CONFIG_CRYPTO_SM3_RISCV64) += sm3-riscv64.o sm3-riscv64-y := sm3-riscv64-glue.o sm3-riscv64-zvksh-zvkb.o diff --git a/arch/riscv/crypto/sha512-riscv64-glue.c b/arch/riscv/crypto/sha512-riscv64-glue.c deleted file mode 100644 index b3dbc71de07b..000000000000 --- a/arch/riscv/crypto/sha512-riscv64-glue.c +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-512 and SHA-384 using the RISC-V vector crypto extensions - * - * Copyright (C) 2023 VRULL GmbH - * Author: Heiko Stuebner - * - * Copyright (C) 2023 SiFive, Inc. - * Author: Jerry Shih - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * Note: the asm function only uses the 'state' field of struct sha512_state. - * It is assumed to be the first field. - */ -asmlinkage void sha512_transform_zvknhb_zvkb( - struct sha512_state *state, const u8 *data, int num_blocks); - -static void sha512_block(struct sha512_state *state, const u8 *data, - int num_blocks) -{ - /* - * Ensure struct sha512_state begins directly with the SHA-512 - * 512-bit internal state, as this is what the asm function expects. 
- */ - BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); - - if (crypto_simd_usable()) { - kernel_vector_begin(); - sha512_transform_zvknhb_zvkb(state, data, num_blocks); - kernel_vector_end(); - } else { - struct __sha512_ctx ctx = {}; - - static_assert(sizeof(ctx.state) == sizeof(state->state)); - memcpy(&ctx.state, state->state, sizeof(ctx.state)); - __sha512_update(&ctx, data, - (size_t)num_blocks * SHA512_BLOCK_SIZE); - memcpy(state->state, &ctx.state, sizeof(state->state)); - } -} - -static int riscv64_sha512_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_base_do_update_blocks(desc, data, len, sha512_block); -} - -static int riscv64_sha512_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha512_base_do_finup(desc, data, len, sha512_block); - return sha512_base_finish(desc, out); -} - -static int riscv64_sha512_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha512_base_init(desc) ?: - riscv64_sha512_finup(desc, data, len, out); -} - -static struct shash_alg riscv64_sha512_algs[] = { - { - .init = sha512_base_init, - .update = riscv64_sha512_update, - .finup = riscv64_sha512_finup, - .digest = riscv64_sha512_digest, - .descsize = SHA512_STATE_SIZE, - .digestsize = SHA512_DIGEST_SIZE, - .base = { - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_name = "sha512", - .cra_driver_name = "sha512-riscv64-zvknhb-zvkb", - .cra_module = THIS_MODULE, - }, - }, { - .init = sha384_base_init, - .update = riscv64_sha512_update, - .finup = riscv64_sha512_finup, - .descsize = SHA512_STATE_SIZE, - .digestsize = SHA384_DIGEST_SIZE, - .base = { - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_name = "sha384", - .cra_driver_name = "sha384-riscv64-zvknhb-zvkb", - .cra_module = THIS_MODULE, - }, - }, -}; - -static int __init riscv64_sha512_mod_init(void) -{ - if (riscv_isa_extension_available(NULL, ZVKNHB) && - riscv_isa_extension_available(NULL, ZVKB) && - riscv_vector_vlen() >= 128) - return crypto_register_shashes(riscv64_sha512_algs, - ARRAY_SIZE(riscv64_sha512_algs)); - - return -ENODEV; -} - -static void __exit riscv64_sha512_mod_exit(void) -{ - crypto_unregister_shashes(riscv64_sha512_algs, - ARRAY_SIZE(riscv64_sha512_algs)); -} - -module_init(riscv64_sha512_mod_init); -module_exit(riscv64_sha512_mod_exit); - -MODULE_DESCRIPTION("SHA-512 (RISC-V accelerated)"); -MODULE_AUTHOR("Heiko Stuebner "); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_CRYPTO("sha512"); -MODULE_ALIAS_CRYPTO("sha384"); diff --git a/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S b/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S deleted file mode 100644 index 89f4a10d12dd..000000000000 --- a/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S +++ /dev/null @@ -1,203 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ -// -// This file is dual-licensed, meaning that you can use it under your -// choice of either of the following two licenses: -// -// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. -// -// Licensed under the Apache License 2.0 (the "License"). 
You can obtain -// a copy in the file LICENSE in the source distribution or at -// https://www.openssl.org/source/license.html -// -// or -// -// Copyright (c) 2023, Christoph Müllner -// Copyright (c) 2023, Phoebe Chen -// Copyright 2024 Google LLC -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// The generated code of this file depends on the following RISC-V extensions: -// - RV64I -// - RISC-V Vector ('V') with VLEN >= 128 -// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknhb') -// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') - -#include - -.text -.option arch, +zvknhb, +zvkb - -#define STATEP a0 -#define DATA a1 -#define NUM_BLOCKS a2 - -#define STATEP_C a3 -#define K a4 - -#define MASK v0 -#define INDICES v1 -#define W0 v10 // LMUL=2 -#define W1 v12 // LMUL=2 -#define W2 v14 // LMUL=2 -#define W3 v16 // LMUL=2 -#define VTMP v20 // LMUL=2 -#define FEBA v22 // LMUL=2 -#define HGDC v24 // LMUL=2 -#define PREV_FEBA v26 // LMUL=2 -#define PREV_HGDC v28 // LMUL=2 - -// Do 4 rounds of SHA-512. w0 contains the current 4 message schedule words. -// -// If not all the message schedule words have been computed yet, then this also -// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4 -// message schedule words; this macro computes the group after w3 and writes it -// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3, -// w0), so the caller must cycle through the registers accordingly. -.macro sha512_4rounds last, w0, w1, w2, w3 - vle64.v VTMP, (K) - addi K, K, 32 - vadd.vv VTMP, VTMP, \w0 - vsha2cl.vv HGDC, FEBA, VTMP - vsha2ch.vv FEBA, HGDC, VTMP -.if !\last - vmerge.vvm VTMP, \w2, \w1, MASK - vsha2ms.vv \w0, VTMP, \w3 -.endif -.endm - -.macro sha512_16rounds last - sha512_4rounds \last, W0, W1, W2, W3 - sha512_4rounds \last, W1, W2, W3, W0 - sha512_4rounds \last, W2, W3, W0, W1 - sha512_4rounds \last, W3, W0, W1, W2 -.endm - -// void sha512_transform_zvknhb_zvkb(u64 state[8], const u8 *data, -// int num_blocks); -SYM_FUNC_START(sha512_transform_zvknhb_zvkb) - - // Setup mask for the vmerge to replace the first word (idx==0) in - // message scheduling. There are 4 words, so an 8-bit mask suffices. 
- vsetivli zero, 1, e8, m1, ta, ma - vmv.v.i MASK, 0x01 - - // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we - // need {f,e,b,a},{h,g,d,c}. The dst vtype is e64m2 and the index vtype - // is e8mf4. We use index-load with the i8 indices {40, 32, 8, 0}, - // loaded using the 32-bit little endian value 0x00082028. - li t0, 0x00082028 - vsetivli zero, 1, e32, m1, ta, ma - vmv.v.x INDICES, t0 - addi STATEP_C, STATEP, 16 - vsetivli zero, 4, e64, m2, ta, ma - vluxei8.v FEBA, (STATEP), INDICES - vluxei8.v HGDC, (STATEP_C), INDICES - -.Lnext_block: - la K, K512 - addi NUM_BLOCKS, NUM_BLOCKS, -1 - - // Save the previous state, as it's needed later. - vmv.v.v PREV_FEBA, FEBA - vmv.v.v PREV_HGDC, HGDC - - // Load the next 1024-bit message block and endian-swap each 64-bit word - vle64.v W0, (DATA) - vrev8.v W0, W0 - addi DATA, DATA, 32 - vle64.v W1, (DATA) - vrev8.v W1, W1 - addi DATA, DATA, 32 - vle64.v W2, (DATA) - vrev8.v W2, W2 - addi DATA, DATA, 32 - vle64.v W3, (DATA) - vrev8.v W3, W3 - addi DATA, DATA, 32 - - // Do the 80 rounds of SHA-512. - sha512_16rounds 0 - sha512_16rounds 0 - sha512_16rounds 0 - sha512_16rounds 0 - sha512_16rounds 1 - - // Add the previous state. - vadd.vv FEBA, FEBA, PREV_FEBA - vadd.vv HGDC, HGDC, PREV_HGDC - - // Repeat if more blocks remain. - bnez NUM_BLOCKS, .Lnext_block - - // Store the new state and return. - vsuxei8.v FEBA, (STATEP), INDICES - vsuxei8.v HGDC, (STATEP_C), INDICES - ret -SYM_FUNC_END(sha512_transform_zvknhb_zvkb) - -.section ".rodata" -.p2align 3 -.type K512, @object -K512: - .dword 0x428a2f98d728ae22, 0x7137449123ef65cd - .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc - .dword 0x3956c25bf348b538, 0x59f111f1b605d019 - .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 - .dword 0xd807aa98a3030242, 0x12835b0145706fbe - .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 - .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 - .dword 0x9bdc06a725c71235, 0xc19bf174cf692694 - .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 - .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 - .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 - .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 - .dword 0x983e5152ee66dfab, 0xa831c66d2db43210 - .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4 - .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725 - .dword 0x06ca6351e003826f, 0x142929670a0e6e70 - .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926 - .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df - .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8 - .dword 0x81c2c92e47edaee6, 0x92722c851482353b - .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001 - .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30 - .dword 0xd192e819d6ef5218, 0xd69906245565a910 - .dword 0xf40e35855771202a, 0x106aa07032bbd1b8 - .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 - .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 - .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb - .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 - .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60 - .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec - .dword 0x90befffa23631e28, 0xa4506cebde82bde9 - .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b - .dword 0xca273eceea26619c, 0xd186b8c721c0c207 - .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 - .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6 - .dword 0x113f9804bef90dae, 0x1b710b35131c471b - .dword 0x28db77f523047d84, 0x32caab7b40c72493 - .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c - .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a - .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 -.size K512, . 
- K512 diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 303ea15e3e90..656f2c08c46c 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -180,6 +180,7 @@ config CRYPTO_LIB_SHA512_ARCH default y if ARM && !CPU_V7M default y if ARM64 default y if MIPS && CPU_CAVIUM_OCTEON + default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 22269ab06d70..bc42464a279b 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -92,6 +92,8 @@ $(obj)/arm64/sha512-core.S: $(src)/../../arch/arm64/lib/crypto/sha2-armv8.pl clean-files += arm64/sha512-core.S libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o endif + +libsha512-$(CONFIG_RISCV) += riscv/sha512-riscv64-zvknhb-zvkb.o endif # CONFIG_CRYPTO_LIB_SHA512_ARCH obj-$(CONFIG_MPILIB) += mpi/ diff --git a/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S b/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S new file mode 100644 index 000000000000..b41eebf60546 --- /dev/null +++ b/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner +// Copyright (c) 2023, Phoebe Chen +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknhb') +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include + +.text +.option arch, +zvknhb, +zvkb + +#define STATEP a0 +#define DATA a1 +#define NUM_BLOCKS a2 + +#define STATEP_C a3 +#define K a4 + +#define MASK v0 +#define INDICES v1 +#define W0 v10 // LMUL=2 +#define W1 v12 // LMUL=2 +#define W2 v14 // LMUL=2 +#define W3 v16 // LMUL=2 +#define VTMP v20 // LMUL=2 +#define FEBA v22 // LMUL=2 +#define HGDC v24 // LMUL=2 +#define PREV_FEBA v26 // LMUL=2 +#define PREV_HGDC v28 // LMUL=2 + +// Do 4 rounds of SHA-512. w0 contains the current 4 message schedule words. +// +// If not all the message schedule words have been computed yet, then this also +// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4 +// message schedule words; this macro computes the group after w3 and writes it +// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3, +// w0), so the caller must cycle through the registers accordingly. +.macro sha512_4rounds last, w0, w1, w2, w3 + vle64.v VTMP, (K) + addi K, K, 32 + vadd.vv VTMP, VTMP, \w0 + vsha2cl.vv HGDC, FEBA, VTMP + vsha2ch.vv FEBA, HGDC, VTMP +.if !\last + vmerge.vvm VTMP, \w2, \w1, MASK + vsha2ms.vv \w0, VTMP, \w3 +.endif +.endm + +.macro sha512_16rounds last + sha512_4rounds \last, W0, W1, W2, W3 + sha512_4rounds \last, W1, W2, W3, W0 + sha512_4rounds \last, W2, W3, W0, W1 + sha512_4rounds \last, W3, W0, W1, W2 +.endm + +// void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state, +// const u8 *data, size_t nblocks); +SYM_FUNC_START(sha512_transform_zvknhb_zvkb) + + // Setup mask for the vmerge to replace the first word (idx==0) in + // message scheduling. There are 4 words, so an 8-bit mask suffices. + vsetivli zero, 1, e8, m1, ta, ma + vmv.v.i MASK, 0x01 + + // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we + // need {f,e,b,a},{h,g,d,c}. The dst vtype is e64m2 and the index vtype + // is e8mf4. We use index-load with the i8 indices {40, 32, 8, 0}, + // loaded using the 32-bit little endian value 0x00082028. + li t0, 0x00082028 + vsetivli zero, 1, e32, m1, ta, ma + vmv.v.x INDICES, t0 + addi STATEP_C, STATEP, 16 + vsetivli zero, 4, e64, m2, ta, ma + vluxei8.v FEBA, (STATEP), INDICES + vluxei8.v HGDC, (STATEP_C), INDICES + +.Lnext_block: + la K, K512 + addi NUM_BLOCKS, NUM_BLOCKS, -1 + + // Save the previous state, as it's needed later. + vmv.v.v PREV_FEBA, FEBA + vmv.v.v PREV_HGDC, HGDC + + // Load the next 1024-bit message block and endian-swap each 64-bit word + vle64.v W0, (DATA) + vrev8.v W0, W0 + addi DATA, DATA, 32 + vle64.v W1, (DATA) + vrev8.v W1, W1 + addi DATA, DATA, 32 + vle64.v W2, (DATA) + vrev8.v W2, W2 + addi DATA, DATA, 32 + vle64.v W3, (DATA) + vrev8.v W3, W3 + addi DATA, DATA, 32 + + // Do the 80 rounds of SHA-512. + sha512_16rounds 0 + sha512_16rounds 0 + sha512_16rounds 0 + sha512_16rounds 0 + sha512_16rounds 1 + + // Add the previous state. + vadd.vv FEBA, FEBA, PREV_FEBA + vadd.vv HGDC, HGDC, PREV_HGDC + + // Repeat if more blocks remain. + bnez NUM_BLOCKS, .Lnext_block + + // Store the new state and return. 
+ vsuxei8.v FEBA, (STATEP), INDICES + vsuxei8.v HGDC, (STATEP_C), INDICES + ret +SYM_FUNC_END(sha512_transform_zvknhb_zvkb) + +.section ".rodata" +.p2align 3 +.type K512, @object +K512: + .dword 0x428a2f98d728ae22, 0x7137449123ef65cd + .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .dword 0x3956c25bf348b538, 0x59f111f1b605d019 + .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .dword 0xd807aa98a3030242, 0x12835b0145706fbe + .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .dword 0x9bdc06a725c71235, 0xc19bf174cf692694 + .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .dword 0x983e5152ee66dfab, 0xa831c66d2db43210 + .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .dword 0x06ca6351e003826f, 0x142929670a0e6e70 + .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .dword 0x81c2c92e47edaee6, 0x92722c851482353b + .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .dword 0xd192e819d6ef5218, 0xd69906245565a910 + .dword 0xf40e35855771202a, 0x106aa07032bbd1b8 + .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .dword 0x90befffa23631e28, 0xa4506cebde82bde9 + .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .dword 0xca273eceea26619c, 0xd186b8c721c0c207 + .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .dword 0x113f9804bef90dae, 0x1b710b35131c471b + .dword 0x28db77f523047d84, 0x32caab7b40c72493 + .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 +.size K512, . - K512 diff --git a/lib/crypto/riscv/sha512.h b/lib/crypto/riscv/sha512.h new file mode 100644 index 000000000000..9d0abede322f --- /dev/null +++ b/lib/crypto/riscv/sha512.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-512 and SHA-384 using the RISC-V vector crypto extensions + * + * Copyright (C) 2023 VRULL GmbH + * Author: Heiko Stuebner + * + * Copyright (C) 2023 SiFive, Inc. 
+ * Author: Jerry Shih + */ + +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); + +asmlinkage void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_extensions) && + likely(crypto_simd_usable())) { + kernel_vector_begin(); + sha512_transform_zvknhb_zvkb(state, data, nblocks); + kernel_vector_end(); + } else { + sha512_blocks_generic(state, data, nblocks); + } +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (riscv_isa_extension_available(NULL, ZVKNHB) && + riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + static_branch_enable(&have_extensions); +} -- cgit v1.2.3 From b7b366087e0f1645f867077a14bab617516d0f57 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:16 -0700 Subject: lib/crypto: s390/sha512: Migrate optimized SHA-512 code to library Instead of exposing the s390-optimized SHA-512 code via s390-specific crypto_shash algorithms, instead just implement the sha512_blocks() library function. This is much simpler, it makes the SHA-512 (and SHA-384) library functions be s390-optimized, and it fixes the longstanding issue where the s390-optimized SHA-512 code was disabled by default. SHA-512 still remains available through crypto_shash, but individual architectures no longer need to handle it. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-13-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - arch/s390/crypto/Kconfig | 10 --- arch/s390/crypto/Makefile | 1 - arch/s390/crypto/sha512_s390.c | 151 -------------------------------------- lib/crypto/Kconfig | 1 + lib/crypto/s390/sha512.h | 28 +++++++ 7 files changed, 29 insertions(+), 164 deletions(-) delete mode 100644 arch/s390/crypto/sha512_s390.c create mode 100644 lib/crypto/s390/sha512.h diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 8ecad727497e..ef313c30b375 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -804,7 +804,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA3_256_S390=m CONFIG_CRYPTO_SHA3_512_S390=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index c13a77765162..b6fa341bb03b 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -791,7 +791,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA3_256_S390=m CONFIG_CRYPTO_SHA3_512_S390=m diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig index e2c27588b21a..4557514fbac3 100644 --- a/arch/s390/crypto/Kconfig +++ b/arch/s390/crypto/Kconfig @@ -2,16 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (s390)" -config CRYPTO_SHA512_S390 - tristate "Hash functions: SHA-384 and SHA-512" - select CRYPTO_HASH - help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180) - - Architecture: s390 - - It is available as of z10. 
- config CRYPTO_SHA1_S390 tristate "Hash functions: SHA-1" select CRYPTO_HASH diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 21757d86cd49..473d64c0982a 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -4,7 +4,6 @@ # obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_SHA512_S390) += sha512_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA3_256_S390) += sha3_256_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c deleted file mode 100644 index e8bb172dbed7..000000000000 --- a/arch/s390/crypto/sha512_s390.c +++ /dev/null @@ -1,151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA512 and SHA38 Secure Hash Algorithm. - * - * Copyright IBM Corp. 2007 - * Author(s): Jan Glauber (jang@de.ibm.com) - */ -#include -#include -#include -#include -#include -#include -#include - -#include "sha.h" - -static int sha512_init_s390(struct shash_desc *desc) -{ - struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - - ctx->sha512.state[0] = SHA512_H0; - ctx->sha512.state[1] = SHA512_H1; - ctx->sha512.state[2] = SHA512_H2; - ctx->sha512.state[3] = SHA512_H3; - ctx->sha512.state[4] = SHA512_H4; - ctx->sha512.state[5] = SHA512_H5; - ctx->sha512.state[6] = SHA512_H6; - ctx->sha512.state[7] = SHA512_H7; - ctx->count = 0; - ctx->sha512.count_hi = 0; - ctx->func = CPACF_KIMD_SHA_512; - - return 0; -} - -static int sha512_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - struct sha512_state *octx = out; - - octx->count[0] = sctx->count; - octx->count[1] = sctx->sha512.count_hi; - memcpy(octx->state, sctx->state, sizeof(octx->state)); - return 0; -} - -static int sha512_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - const struct sha512_state *ictx = in; - - sctx->count = ictx->count[0]; - sctx->sha512.count_hi = ictx->count[1]; - - memcpy(sctx->state, ictx->state, sizeof(ictx->state)); - sctx->func = CPACF_KIMD_SHA_512; - return 0; -} - -static struct shash_alg sha512_alg = { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_init_s390, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha512_export, - .import = sha512_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name= "sha512-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -MODULE_ALIAS_CRYPTO("sha512"); - -static int sha384_init_s390(struct shash_desc *desc) -{ - struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - - ctx->sha512.state[0] = SHA384_H0; - ctx->sha512.state[1] = SHA384_H1; - ctx->sha512.state[2] = SHA384_H2; - ctx->sha512.state[3] = SHA384_H3; - ctx->sha512.state[4] = SHA384_H4; - ctx->sha512.state[5] = SHA384_H5; - ctx->sha512.state[6] = SHA384_H6; - ctx->sha512.state[7] = SHA384_H7; - ctx->count = 0; - ctx->sha512.count_hi = 0; - ctx->func = CPACF_KIMD_SHA_512; - - return 0; -} - -static struct shash_alg sha384_alg = { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_init_s390, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha512_export, - .import = sha512_import, - .descsize = 
sizeof(struct s390_sha_ctx), - .statesize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name= "sha384-s390", - .cra_priority = 300, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_ctxsize = sizeof(struct s390_sha_ctx), - .cra_module = THIS_MODULE, - } -}; - -MODULE_ALIAS_CRYPTO("sha384"); - -static int __init init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512)) - return -ENODEV; - if ((ret = crypto_register_shash(&sha512_alg)) < 0) - goto out; - if ((ret = crypto_register_shash(&sha384_alg)) < 0) - crypto_unregister_shash(&sha512_alg); -out: - return ret; -} - -static void __exit fini(void) -{ - crypto_unregister_shash(&sha512_alg); - crypto_unregister_shash(&sha384_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA512 and SHA-384 Secure Hash Algorithm"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 656f2c08c46c..6f1386b8d79d 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -181,6 +181,7 @@ config CRYPTO_LIB_SHA512_ARCH default y if ARM64 default y if MIPS && CPU_CAVIUM_OCTEON default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if S390 config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/s390/sha512.h b/lib/crypto/s390/sha512.h new file mode 100644 index 000000000000..24744651550c --- /dev/null +++ b/lib/crypto/s390/sha512.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-512 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha512); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_cpacf_sha512)) + cpacf_kimd(CPACF_KIMD_SHA_512, state, data, + nblocks * SHA512_BLOCK_SIZE); + else + sha512_blocks_generic(state, data, nblocks); +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_have_feature(S390_CPU_FEATURE_MSA) && + cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512)) + static_branch_enable(&have_cpacf_sha512); +} -- cgit v1.2.3 From 02b35bab7e6c5bb2a843828316d528216b8cedc8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:17 -0700 Subject: lib/crypto: sparc/sha512: Migrate optimized SHA-512 code to library Instead of exposing the sparc-optimized SHA-512 code via sparc-specific crypto_shash algorithms, instead just implement the sha512_blocks() library function. This is much simpler, it makes the SHA-512 (and SHA-384) library functions be sparc-optimized, and it fixes the longstanding issue where the sparc-optimized SHA-512 code was disabled by default. SHA-512 still remains available through crypto_shash, but individual architectures no longer need to handle it. To match sha512_blocks(), change the type of the nblocks parameter of the assembly function from int to size_t. The assembly function actually already treated it as size_t. Note: to see the diff from arch/sparc/crypto/sha512_glue.c to lib/crypto/sparc/sha512.h, view this commit with 'git show -M10'. 
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-14-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/sparc/crypto/Kconfig | 10 ---- arch/sparc/crypto/Makefile | 2 - arch/sparc/crypto/sha512_asm.S | 102 --------------------------------- arch/sparc/crypto/sha512_glue.c | 122 ---------------------------------------- lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 1 + lib/crypto/sparc/sha512.h | 42 ++++++++++++++ lib/crypto/sparc/sha512_asm.S | 102 +++++++++++++++++++++++++++++++++ 8 files changed, 146 insertions(+), 236 deletions(-) delete mode 100644 arch/sparc/crypto/sha512_asm.S delete mode 100644 arch/sparc/crypto/sha512_glue.c create mode 100644 lib/crypto/sparc/sha512.h create mode 100644 lib/crypto/sparc/sha512_asm.S diff --git a/arch/sparc/crypto/Kconfig b/arch/sparc/crypto/Kconfig index a6ba319c42dc..9d8da9aef3a4 100644 --- a/arch/sparc/crypto/Kconfig +++ b/arch/sparc/crypto/Kconfig @@ -36,16 +36,6 @@ config CRYPTO_SHA1_SPARC64 Architecture: sparc64 -config CRYPTO_SHA512_SPARC64 - tristate "Hash functions: SHA-384 and SHA-512" - depends on SPARC64 - select CRYPTO_SHA512 - select CRYPTO_HASH - help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180) - - Architecture: sparc64 using crypto instructions, when available - config CRYPTO_AES_SPARC64 tristate "Ciphers: AES, modes: ECB, CBC, CTR" depends on SPARC64 diff --git a/arch/sparc/crypto/Makefile b/arch/sparc/crypto/Makefile index 701c39edb0d7..99a7e8fd13bc 100644 --- a/arch/sparc/crypto/Makefile +++ b/arch/sparc/crypto/Makefile @@ -4,7 +4,6 @@ # obj-$(CONFIG_CRYPTO_SHA1_SPARC64) += sha1-sparc64.o -obj-$(CONFIG_CRYPTO_SHA512_SPARC64) += sha512-sparc64.o obj-$(CONFIG_CRYPTO_MD5_SPARC64) += md5-sparc64.o obj-$(CONFIG_CRYPTO_AES_SPARC64) += aes-sparc64.o @@ -12,7 +11,6 @@ obj-$(CONFIG_CRYPTO_DES_SPARC64) += des-sparc64.o obj-$(CONFIG_CRYPTO_CAMELLIA_SPARC64) += camellia-sparc64.o sha1-sparc64-y := sha1_asm.o sha1_glue.o -sha512-sparc64-y := sha512_asm.o sha512_glue.o md5-sparc64-y := md5_asm.o md5_glue.o aes-sparc64-y := aes_asm.o aes_glue.o diff --git a/arch/sparc/crypto/sha512_asm.S b/arch/sparc/crypto/sha512_asm.S deleted file mode 100644 index 9932b4fe1b59..000000000000 --- a/arch/sparc/crypto/sha512_asm.S +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include - -ENTRY(sha512_sparc64_transform) - /* %o0 = digest, %o1 = data, %o2 = rounds */ - VISEntry - ldd [%o0 + 0x00], %f0 - ldd [%o0 + 0x08], %f2 - ldd [%o0 + 0x10], %f4 - ldd [%o0 + 0x18], %f6 - ldd [%o0 + 0x20], %f8 - ldd [%o0 + 0x28], %f10 - andcc %o1, 0x7, %g0 - ldd [%o0 + 0x30], %f12 - bne,pn %xcc, 10f - ldd [%o0 + 0x38], %f14 - -1: - ldd [%o1 + 0x00], %f16 - ldd [%o1 + 0x08], %f18 - ldd [%o1 + 0x10], %f20 - ldd [%o1 + 0x18], %f22 - ldd [%o1 + 0x20], %f24 - ldd [%o1 + 0x28], %f26 - ldd [%o1 + 0x30], %f28 - ldd [%o1 + 0x38], %f30 - ldd [%o1 + 0x40], %f32 - ldd [%o1 + 0x48], %f34 - ldd [%o1 + 0x50], %f36 - ldd [%o1 + 0x58], %f38 - ldd [%o1 + 0x60], %f40 - ldd [%o1 + 0x68], %f42 - ldd [%o1 + 0x70], %f44 - ldd [%o1 + 0x78], %f46 - - SHA512 - - subcc %o2, 1, %o2 - bne,pt %xcc, 1b - add %o1, 0x80, %o1 - -5: - std %f0, [%o0 + 0x00] - std %f2, [%o0 + 0x08] - std %f4, [%o0 + 0x10] - std %f6, [%o0 + 0x18] - std %f8, [%o0 + 0x20] - std %f10, [%o0 + 0x28] - std %f12, [%o0 + 0x30] - std %f14, [%o0 + 0x38] - retl - VISExit -10: - alignaddr %o1, %g0, %o1 - - ldd [%o1 + 0x00], %f18 -1: - ldd [%o1 + 0x08], %f20 - ldd [%o1 + 0x10], %f22 - ldd [%o1 + 0x18], %f24 - ldd [%o1 + 0x20], %f26 - ldd [%o1 + 0x28], %f28 
- ldd [%o1 + 0x30], %f30 - ldd [%o1 + 0x38], %f32 - ldd [%o1 + 0x40], %f34 - ldd [%o1 + 0x48], %f36 - ldd [%o1 + 0x50], %f38 - ldd [%o1 + 0x58], %f40 - ldd [%o1 + 0x60], %f42 - ldd [%o1 + 0x68], %f44 - ldd [%o1 + 0x70], %f46 - ldd [%o1 + 0x78], %f48 - ldd [%o1 + 0x80], %f50 - - faligndata %f18, %f20, %f16 - faligndata %f20, %f22, %f18 - faligndata %f22, %f24, %f20 - faligndata %f24, %f26, %f22 - faligndata %f26, %f28, %f24 - faligndata %f28, %f30, %f26 - faligndata %f30, %f32, %f28 - faligndata %f32, %f34, %f30 - faligndata %f34, %f36, %f32 - faligndata %f36, %f38, %f34 - faligndata %f38, %f40, %f36 - faligndata %f40, %f42, %f38 - faligndata %f42, %f44, %f40 - faligndata %f44, %f46, %f42 - faligndata %f46, %f48, %f44 - faligndata %f48, %f50, %f46 - - SHA512 - - subcc %o2, 1, %o2 - fsrc2 %f50, %f18 - bne,pt %xcc, 1b - add %o1, 0x80, %o1 - - ba,a,pt %xcc, 5b -ENDPROC(sha512_sparc64_transform) diff --git a/arch/sparc/crypto/sha512_glue.c b/arch/sparc/crypto/sha512_glue.c deleted file mode 100644 index fb81c3290c8c..000000000000 --- a/arch/sparc/crypto/sha512_glue.c +++ /dev/null @@ -1,122 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Glue code for SHA512 hashing optimized for sparc64 crypto opcodes. - * - * This is based largely upon crypto/sha512_generic.c - * - * Copyright (c) Jean-Luc Cooke - * Copyright (c) Andrew McDonald - * Copyright (c) 2003 Kyle McMartin - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void sha512_sparc64_transform(u64 *digest, const char *data, - unsigned int rounds); - -static void sha512_block(struct sha512_state *sctx, const u8 *src, int blocks) -{ - sha512_sparc64_transform(sctx->state, src, blocks); -} - -static int sha512_sparc64_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_base_do_update_blocks(desc, data, len, sha512_block); -} - -static int sha512_sparc64_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *out) -{ - sha512_base_do_finup(desc, src, len, sha512_block); - return sha512_base_finish(desc, out); -} - -static struct shash_alg sha512_alg = { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_base_init, - .update = sha512_sparc64_update, - .finup = sha512_sparc64_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name= "sha512-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static struct shash_alg sha384_alg = { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_base_init, - .update = sha512_sparc64_update, - .finup = sha512_sparc64_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name= "sha384-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static bool __init sparc64_has_sha512_opcode(void) -{ - unsigned long cfr; - - if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) - return false; - - __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); - if (!(cfr & CFR_SHA512)) - return false; - - return true; -} - -static int __init sha512_sparc64_mod_init(void) -{ - if (sparc64_has_sha512_opcode()) { - int ret = crypto_register_shash(&sha384_alg); - if (ret < 0) - return ret; - - ret = crypto_register_shash(&sha512_alg); - if (ret < 0) { - crypto_unregister_shash(&sha384_alg); - return ret; - } - - pr_info("Using sparc64 sha512 opcode optimized 
SHA-512/SHA-384 implementation\n"); - return 0; - } - pr_info("sparc64 sha512 opcode not available.\n"); - return -ENODEV; -} - -static void __exit sha512_sparc64_mod_fini(void) -{ - crypto_unregister_shash(&sha384_alg); - crypto_unregister_shash(&sha512_alg); -} - -module_init(sha512_sparc64_mod_init); -module_exit(sha512_sparc64_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-384 and SHA-512 Secure Hash Algorithm, sparc64 sha512 opcode accelerated"); - -MODULE_ALIAS_CRYPTO("sha384"); -MODULE_ALIAS_CRYPTO("sha512"); - -#include "crop_devid.c" diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 6f1386b8d79d..c3edb78484f0 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -182,6 +182,7 @@ config CRYPTO_LIB_SHA512_ARCH default y if MIPS && CPU_CAVIUM_OCTEON default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO default y if S390 + default y if SPARC64 config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index bc42464a279b..0fb93f2469b3 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -94,6 +94,7 @@ libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o endif libsha512-$(CONFIG_RISCV) += riscv/sha512-riscv64-zvknhb-zvkb.o +libsha512-$(CONFIG_SPARC) += sparc/sha512_asm.o endif # CONFIG_CRYPTO_LIB_SHA512_ARCH obj-$(CONFIG_MPILIB) += mpi/ diff --git a/lib/crypto/sparc/sha512.h b/lib/crypto/sparc/sha512.h new file mode 100644 index 000000000000..55303ab6b15f --- /dev/null +++ b/lib/crypto/sparc/sha512.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SHA-512 accelerated using the sparc64 sha512 opcodes + * + * Copyright (c) Jean-Luc Cooke + * Copyright (c) Andrew McDonald + * Copyright (c) 2003 Kyle McMartin + */ + +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_opcodes); + +asmlinkage void sha512_sparc64_transform(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha512_opcodes)) + sha512_sparc64_transform(state, data, nblocks); + else + sha512_blocks_generic(state, data, nblocks); +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA512)) + return; + + static_branch_enable(&have_sha512_opcodes); + pr_info("Using sparc64 sha512 opcode optimized SHA-512/SHA-384 implementation\n"); +} diff --git a/lib/crypto/sparc/sha512_asm.S b/lib/crypto/sparc/sha512_asm.S new file mode 100644 index 000000000000..9932b4fe1b59 --- /dev/null +++ b/lib/crypto/sparc/sha512_asm.S @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include + +ENTRY(sha512_sparc64_transform) + /* %o0 = digest, %o1 = data, %o2 = rounds */ + VISEntry + ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + ldd [%o0 + 0x10], %f4 + ldd [%o0 + 0x18], %f6 + ldd [%o0 + 0x20], %f8 + ldd [%o0 + 0x28], %f10 + andcc %o1, 0x7, %g0 + ldd [%o0 + 0x30], %f12 + bne,pn %xcc, 10f + ldd [%o0 + 0x38], %f14 + +1: + ldd [%o1 + 0x00], %f16 + ldd [%o1 + 0x08], %f18 + ldd [%o1 + 0x10], %f20 + ldd [%o1 + 0x18], %f22 + ldd [%o1 + 0x20], %f24 + ldd [%o1 + 0x28], %f26 + ldd [%o1 + 0x30], %f28 + ldd [%o1 + 0x38], %f30 + ldd [%o1 + 0x40], %f32 + ldd [%o1 + 0x48], %f34 + ldd [%o1 + 0x50], %f36 + ldd [%o1 + 0x58], %f38 + 
ldd [%o1 + 0x60], %f40 + ldd [%o1 + 0x68], %f42 + ldd [%o1 + 0x70], %f44 + ldd [%o1 + 0x78], %f46 + + SHA512 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x80, %o1 + +5: + std %f0, [%o0 + 0x00] + std %f2, [%o0 + 0x08] + std %f4, [%o0 + 0x10] + std %f6, [%o0 + 0x18] + std %f8, [%o0 + 0x20] + std %f10, [%o0 + 0x28] + std %f12, [%o0 + 0x30] + std %f14, [%o0 + 0x38] + retl + VISExit +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f18 +1: + ldd [%o1 + 0x08], %f20 + ldd [%o1 + 0x10], %f22 + ldd [%o1 + 0x18], %f24 + ldd [%o1 + 0x20], %f26 + ldd [%o1 + 0x28], %f28 + ldd [%o1 + 0x30], %f30 + ldd [%o1 + 0x38], %f32 + ldd [%o1 + 0x40], %f34 + ldd [%o1 + 0x48], %f36 + ldd [%o1 + 0x50], %f38 + ldd [%o1 + 0x58], %f40 + ldd [%o1 + 0x60], %f42 + ldd [%o1 + 0x68], %f44 + ldd [%o1 + 0x70], %f46 + ldd [%o1 + 0x78], %f48 + ldd [%o1 + 0x80], %f50 + + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + faligndata %f26, %f28, %f24 + faligndata %f28, %f30, %f26 + faligndata %f30, %f32, %f28 + faligndata %f32, %f34, %f30 + faligndata %f34, %f36, %f32 + faligndata %f36, %f38, %f34 + faligndata %f38, %f40, %f36 + faligndata %f40, %f42, %f38 + faligndata %f42, %f44, %f40 + faligndata %f44, %f46, %f42 + faligndata %f46, %f48, %f44 + faligndata %f48, %f50, %f46 + + SHA512 + + subcc %o2, 1, %o2 + fsrc2 %f50, %f18 + bne,pt %xcc, 1b + add %o1, 0x80, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(sha512_sparc64_transform) -- cgit v1.2.3 From 484c18119f4fbc6bca7c41e64b6fa84133e1057b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:18 -0700 Subject: lib/crypto: x86/sha512: Migrate optimized SHA-512 code to library Instead of exposing the x86-optimized SHA-512 code via x86-specific crypto_shash algorithms, instead just implement the sha512_blocks() library function. This is much simpler, it makes the SHA-512 (and SHA-384) library functions be x86-optimized, and it fixes the longstanding issue where the x86-optimized SHA-512 code was disabled by default. SHA-512 still remains available through crypto_shash, but individual architectures no longer need to handle it. To match sha512_blocks(), change the type of the nblocks parameter of the assembly functions from int to size_t. The assembly functions actually already treated it as size_t. 
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-15-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/x86/crypto/Kconfig | 13 - arch/x86/crypto/Makefile | 3 - arch/x86/crypto/sha512-avx-asm.S | 423 -------------------- arch/x86/crypto/sha512-avx2-asm.S | 750 ----------------------------------- arch/x86/crypto/sha512-ssse3-asm.S | 425 -------------------- arch/x86/crypto/sha512_ssse3_glue.c | 322 ---------------- lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 3 + lib/crypto/x86/sha512-avx-asm.S | 424 ++++++++++++++++++++ lib/crypto/x86/sha512-avx2-asm.S | 751 ++++++++++++++++++++++++++++++++++++ lib/crypto/x86/sha512-ssse3-asm.S | 423 ++++++++++++++++++++ lib/crypto/x86/sha512.h | 54 +++ 12 files changed, 1656 insertions(+), 1936 deletions(-) delete mode 100644 arch/x86/crypto/sha512-avx-asm.S delete mode 100644 arch/x86/crypto/sha512-avx2-asm.S delete mode 100644 arch/x86/crypto/sha512-ssse3-asm.S delete mode 100644 arch/x86/crypto/sha512_ssse3_glue.c create mode 100644 lib/crypto/x86/sha512-avx-asm.S create mode 100644 lib/crypto/x86/sha512-avx2-asm.S create mode 100644 lib/crypto/x86/sha512-ssse3-asm.S create mode 100644 lib/crypto/x86/sha512.h diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 56cfdc79e2c6..eb641a300154 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -390,19 +390,6 @@ config CRYPTO_SHA1_SSSE3 - AVX2 (Advanced Vector Extensions 2) - SHA-NI (SHA Extensions New Instructions) -config CRYPTO_SHA512_SSSE3 - tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)" - depends on 64BIT - select CRYPTO_SHA512 - select CRYPTO_HASH - help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX (Advanced Vector Extensions) - - AVX2 (Advanced Vector Extensions 2) - config CRYPTO_SM3_AVX_X86_64 tristate "Hash functions: SM3 (AVX)" depends on 64BIT diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index aa289a9e0153..d31348be8370 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -54,9 +54,6 @@ endif obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o -obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o -sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o - obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S deleted file mode 100644 index 5bfce4b045fd..000000000000 --- a/arch/x86/crypto/sha512-avx-asm.S +++ /dev/null @@ -1,423 +0,0 @@ -######################################################################## -# Implement fast SHA-512 with AVX instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford -# Kirk Yap -# David Cote -# Tim Chen -# -# This software is available to you under a choice of one of two -# licenses. 
You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-512 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## - -#include -#include - -.text - -# Virtual Registers -# ARG1 -digest = %rdi -# ARG2 -msg = %rsi -# ARG3 -msglen = %rdx -T1 = %rcx -T2 = %r8 -a_64 = %r9 -b_64 = %r10 -c_64 = %r11 -d_64 = %r12 -e_64 = %r13 -f_64 = %r14 -g_64 = %r15 -h_64 = %rbx -tmp0 = %rax - -# Local variables (stack frame) - -# Message Schedule -W_SIZE = 80*8 -# W[t] + K[t] | W[t+1] + K[t+1] -WK_SIZE = 2*8 - -frame_W = 0 -frame_WK = frame_W + W_SIZE -frame_size = frame_WK + WK_SIZE - -# Useful QWORD "arrays" for simpler memory references -# MSG, DIGEST, K_t, W_t are arrays -# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even - -# Input message (arg1) -#define MSG(i) 8*i(msg) - -# Output Digest (arg2) -#define DIGEST(i) 8*i(digest) - -# SHA Constants (static mem) -#define K_t(i) 8*i+K512(%rip) - -# Message Schedule (stack frame) -#define W_t(i) 8*i+frame_W(%rsp) - -# W[t]+K[t] (stack frame) -#define WK_2(i) 8*((i%2))+frame_WK(%rsp) - -.macro RotateState - # Rotate symbols a..h right - TMP = h_64 - h_64 = g_64 - g_64 = f_64 - f_64 = e_64 - e_64 = d_64 - d_64 = c_64 - c_64 = b_64 - b_64 = a_64 - a_64 = TMP -.endm - -.macro RORQ p1 p2 - # shld is faster than ror on Sandybridge - shld $(64-\p2), \p1, \p1 -.endm - -.macro SHA512_Round rnd - # Compute Round %%t - mov f_64, T1 # T1 = f - mov e_64, tmp0 # tmp = e - xor g_64, T1 # T1 = f ^ g - RORQ tmp0, 23 # 41 # tmp = e ror 23 - and e_64, T1 # T1 = (f ^ g) & e - xor e_64, tmp0 # tmp = (e ror 23) ^ e - xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) - idx = \rnd - add WK_2(idx), T1 # W[t] + K[t] from message scheduler - RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4 - xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e - mov a_64, T2 # T2 = a - add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h - RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) - add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) - mov a_64, tmp0 # tmp = a - xor c_64, T2 # T2 = a ^ c - and c_64, tmp0 # tmp = a & c - 
and b_64, T2 # T2 = (a ^ c) & b - xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) - mov a_64, tmp0 # tmp = a - RORQ tmp0, 5 # 39 # tmp = a ror 5 - xor a_64, tmp0 # tmp = (a ror 5) ^ a - add T1, d_64 # e(next_state) = d + T1 - RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6 - xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a - lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) - RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) - add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) - RotateState -.endm - -.macro SHA512_2Sched_2Round_avx rnd - # Compute rounds t-2 and t-1 - # Compute message schedule QWORDS t and t+1 - - # Two rounds are computed based on the values for K[t-2]+W[t-2] and - # K[t-1]+W[t-1] which were previously stored at WK_2 by the message - # scheduler. - # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)]. - # They are then added to their respective SHA512 constants at - # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)] - # For brievity, the comments following vectored instructions only refer to - # the first of a pair of QWORDS. - # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} - # The computation of the message schedule and the rounds are tightly - # stitched to take advantage of instruction-level parallelism. - - idx = \rnd - 2 - vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2] - idx = \rnd - 15 - vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15] - mov f_64, T1 - vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61 - mov e_64, tmp0 - vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1 - xor g_64, T1 - RORQ tmp0, 23 # 41 - vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19 - and e_64, T1 - xor e_64, tmp0 - vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 - xor g_64, T1 - idx = \rnd - add WK_2(idx), T1# - vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8 - RORQ tmp0, 4 # 18 - vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6 - xor e_64, tmp0 - mov a_64, T2 - add h_64, T1 - vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 - RORQ tmp0, 14 # 14 - add tmp0, T1 - vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7 - mov a_64, tmp0 - xor c_64, T2 - vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3 - and c_64, tmp0 - and b_64, T2 - vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3 - xor tmp0, T2 - mov a_64, tmp0 - vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63 - RORQ tmp0, 5 # 39 - vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63 - xor a_64, tmp0 - add T1, d_64 - RORQ tmp0, 6 # 34 - xor a_64, tmp0 - vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ - # W[t-15]>>7 ^ W[t-15]<<63 - lea (T1, T2), h_64 - RORQ tmp0, 28 # 28 - vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25 - add tmp0, h_64 - RotateState - vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ - # W[t-2]<<25 - mov f_64, T1 - vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) - mov e_64, tmp0 - xor g_64, T1 - idx = \rnd - 16 - vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] - idx = \rnd - 7 - vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7] - RORQ tmp0, 23 # 41 - and e_64, T1 - xor e_64, tmp0 - xor g_64, T1 - vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56 - idx = \rnd + 1 - add WK_2(idx), T1 - vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15]) - RORQ tmp0, 4 # 18 - vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) - xor e_64, tmp0 - vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] + - # s0(W[t-15]) + W[t-16] - mov a_64, T2 - add h_64, T1 - RORQ tmp0, 14 # 14 - add tmp0, T1 - idx = \rnd - vmovdqa %xmm0, W_t(idx) # Store W[t] - vpaddq 
K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t] - vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds - mov a_64, tmp0 - xor c_64, T2 - and c_64, tmp0 - and b_64, T2 - xor tmp0, T2 - mov a_64, tmp0 - RORQ tmp0, 5 # 39 - xor a_64, tmp0 - add T1, d_64 - RORQ tmp0, 6 # 34 - xor a_64, tmp0 - lea (T1, T2), h_64 - RORQ tmp0, 28 # 28 - add tmp0, h_64 - RotateState -.endm - -######################################################################## -# void sha512_transform_avx(sha512_state *state, const u8 *data, int blocks) -# Purpose: Updates the SHA512 digest stored at "state" with the message -# stored in "data". -# The size of the message pointed to by "data" must be an integer multiple -# of SHA512 message blocks. -# "blocks" is the message length in SHA512 blocks -######################################################################## -SYM_TYPED_FUNC_START(sha512_transform_avx) - test msglen, msglen - je .Lnowork - - # Save GPRs - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - # Allocate Stack Space - push %rbp - mov %rsp, %rbp - sub $frame_size, %rsp - and $~(0x20 - 1), %rsp - -.Lupdateblock: - - # Load state variables - mov DIGEST(0), a_64 - mov DIGEST(1), b_64 - mov DIGEST(2), c_64 - mov DIGEST(3), d_64 - mov DIGEST(4), e_64 - mov DIGEST(5), f_64 - mov DIGEST(6), g_64 - mov DIGEST(7), h_64 - - t = 0 - .rept 80/2 + 1 - # (80 rounds) / (2 rounds/iteration) + (1 iteration) - # +1 iteration because the scheduler leads hashing by 1 iteration - .if t < 2 - # BSWAP 2 QWORDS - vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1 - vmovdqu MSG(t), %xmm0 - vpshufb %xmm1, %xmm0, %xmm0 # BSWAP - vmovdqa %xmm0, W_t(t) # Store Scheduled Pair - vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] - vmovdqa %xmm0, WK_2(t) # Store into WK for rounds - .elseif t < 16 - # BSWAP 2 QWORDS# Compute 2 Rounds - vmovdqu MSG(t), %xmm0 - vpshufb %xmm1, %xmm0, %xmm0 # BSWAP - SHA512_Round t-2 # Round t-2 - vmovdqa %xmm0, W_t(t) # Store Scheduled Pair - vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] - SHA512_Round t-1 # Round t-1 - vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK - .elseif t < 79 - # Schedule 2 QWORDS# Compute 2 Rounds - SHA512_2Sched_2Round_avx t - .else - # Compute 2 Rounds - SHA512_Round t-2 - SHA512_Round t-1 - .endif - t = t+2 - .endr - - # Update digest - add a_64, DIGEST(0) - add b_64, DIGEST(1) - add c_64, DIGEST(2) - add d_64, DIGEST(3) - add e_64, DIGEST(4) - add f_64, DIGEST(5) - add g_64, DIGEST(6) - add h_64, DIGEST(7) - - # Advance to next message block - add $16*8, msg - dec msglen - jnz .Lupdateblock - - # Restore Stack Pointer - mov %rbp, %rsp - pop %rbp - - # Restore GPRs - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - -.Lnowork: - RET -SYM_FUNC_END(sha512_transform_avx) - -######################################################################## -### Binary Data - -.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 -.align 16 -# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. -XMM_QWORD_BSWAP: - .octa 0x08090a0b0c0d0e0f0001020304050607 - -# Mergeable 640-byte rodata section. This allows linker to merge the table -# with other, exactly the same 640-byte fragment of another rodata section -# (if such section exists). 
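For readers tracing the register comments above: the scalar instructions in SHA512_Round implement one FIPS 180-4 round. A minimal C reference of that round is sketched below for comparison only; the helper names (ror64, sha512_round, and so on) are illustrative and do not come from the kernel sources.

/* Reference model of one SHA-512 round, matching the comments in the
 * assembly above: T1 = h + S1(e) + Ch(e,f,g) + K[t] + W[t],
 * T2 = S0(a) + Maj(a,b,c); then the working variables rotate.
 * Illustrative sketch; names are not taken from the kernel sources.
 */
#include <stdint.h>

static inline uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)
{
	return (e & (f ^ g)) ^ g;	/* same form as ((f^g)&e)^g above */
}

static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c)
{
	return ((a ^ c) & b) ^ (a & c);	/* form used in the asm comments */
}

static inline uint64_t S0(uint64_t a)	/* big Sigma0 */
{
	return ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);
}

static inline uint64_t S1(uint64_t e)	/* big Sigma1 */
{
	return ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);
}

static void sha512_round(uint64_t s[8], uint64_t wk)	/* wk = W[t] + K[t] */
{
	uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
	uint64_t t1 = h + S1(e) + Ch(e, f, g) + wk;
	uint64_t t2 = S0(a) + Maj(a, b, c);

	/* Rotate the working variables, as RotateState does with symbols. */
	s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
	s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

The cumulative rotate counts in the assembly comments (23, then 4, then 14; and 5, then 6, then 28) compose to the 14/18/41 and 28/34/39 rotations used here, which is why each RORQ line is annotated with the running total.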
-.section .rodata.cst640.K512, "aM", @progbits, 640 -.align 64 -# K[t] used in SHA512 hashing -K512: - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S deleted file mode 100644 index 24973f42c43f..000000000000 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ /dev/null @@ -1,750 +0,0 @@ -######################################################################## -# Implement fast SHA-512 with AVX2 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford -# Kirk Yap -# David Cote -# Tim Chen -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-512 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## -# This code schedules 1 blocks at a time, with 4 lanes per block -######################################################################## - -#include -#include - -.text - -# Virtual Registers -Y_0 = %ymm4 -Y_1 = %ymm5 -Y_2 = %ymm6 -Y_3 = %ymm7 - -YTMP0 = %ymm0 -YTMP1 = %ymm1 -YTMP2 = %ymm2 -YTMP3 = %ymm3 -YTMP4 = %ymm8 -XFER = YTMP0 - -BYTE_FLIP_MASK = %ymm9 - -# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 -CTX1 = %rdi -CTX2 = %r12 -# 2nd arg -INP = %rsi -# 3rd arg -NUM_BLKS = %rdx - -c = %rcx -d = %r8 -e = %rdx -y3 = %rsi - -TBL = %rdi # clobbers CTX1 - -a = %rax -b = %rbx - -f = %r9 -g = %r10 -h = %r11 -old_h = %r11 - -T1 = %r12 # clobbers CTX2 -y0 = %r13 -y1 = %r14 -y2 = %r15 - -# Local variables (stack frame) -XFER_SIZE = 4*8 -SRND_SIZE = 1*8 -INP_SIZE = 1*8 -INPEND_SIZE = 1*8 -CTX_SIZE = 1*8 - -frame_XFER = 0 -frame_SRND = frame_XFER + XFER_SIZE -frame_INP = frame_SRND + SRND_SIZE -frame_INPEND = frame_INP + INP_SIZE -frame_CTX = frame_INPEND + INPEND_SIZE -frame_size = frame_CTX + CTX_SIZE - -## assume buffers not aligned -#define VMOVDQ vmovdqu - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - - -# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask -# Load ymm with mem and byte swap each dword -.macro COPY_YMM_AND_BSWAP p1 p2 p3 - VMOVDQ \p2, \p1 - vpshufb \p3, \p1, \p1 -.endm -# rotate_Ys -# Rotate values of symbols Y0...Y3 -.macro rotate_Ys - Y_ = Y_0 - Y_0 = Y_1 - Y_1 = Y_2 - Y_2 = Y_3 - Y_3 = Y_ -.endm - -# RotateState -.macro RotateState - # Rotate symbols a..h right - old_h = h - TMP_ = h - h = g - g = f - f = e - e = d - d = c - c = b - b = a - a = TMP_ -.endm - -# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL -# YDST = {YSRC1, YSRC2} >> RVAL*8 -.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL - vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI} - vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 -.endm - -.macro FOUR_ROUNDS_AND_SCHED -################################### RND N + 0 ######################################### - - # Extract w[t-7] - MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] - # Calculate w[t-16] + w[t-7] - vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] - # Extract w[t-15] - MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] - - # Calculate sigma0 - - # Calculate w[t-15] ror 1 - vpsrlq $1, YTMP1, YTMP2 - vpsllq $(64-1), YTMP1, YTMP3 - vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 - # Calculate w[t-15] shr 7 - vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 - - mov a, y3 # y3 = a # MAJA - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - add frame_XFER(%rsp),h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - mov f, y2 # y2 = f # CH - rorx $34, a, T1 # T1 = a >> 34 # S0B - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - xor g, y2 # y2 = f^g # CH - rorx $14, e, y1 # y1 = (e >> 14) # S1 - 
- and e, y2 # y2 = (f^g)&e # CH - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $39, a, y1 # y1 = a >> 39 # S0A - add h, d # d = k + w + h + d # -- - - and b, y3 # y3 = (a|c)&b # MAJA - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - rorx $28, a, T1 # T1 = (a >> 28) # S0 - - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - - add y0, y2 # y2 = S1 + CH # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - RotateState - -################################### RND N + 1 ######################################### - - # Calculate w[t-15] ror 8 - vpsrlq $8, YTMP1, YTMP2 - vpsllq $(64-8), YTMP1, YTMP1 - vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 - # XOR the three components - vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 - vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 - - - # Add three components, w[t-16], w[t-7] and sigma0 - vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 - # Move to appropriate lanes for calculating w[16] and w[17] - vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} - # Move to appropriate lanes for calculating w[18] and w[19] - vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} - - # Calculate w[16] and w[17] in both 128 bit lanes - - # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes - vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} - vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} - - - mov a, y3 # y3 = a # MAJA - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - mov f, y2 # y2 = f # CH - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - xor g, y2 # y2 = f^g # CH - - - rorx $14, e, y1 # y1 = (e >> 14) # S1 - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $39, a, y1 # y1 = a >> 39 # S0A - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - - and b, y3 # y3 = (a|c)&b # MAJA - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - - rorx $28, a, T1 # T1 = (a >> 28) # S0 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - RotateState - - -################################### RND N + 2 ######################################### - - vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} - vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} - vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} - vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} - vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} - vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} - vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} - vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ - # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} - - # Add sigma1 to the other compunents to get w[16] and w[17] - vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} - - # Calculate 
sigma1 for w[18] and w[19] for upper 128 bit lane - vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} - - mov a, y3 # y3 = a # MAJA - rorx $41, e, y0 # y0 = e >> 41 # S1A - add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- - - rorx $18, e, y1 # y1 = e >> 18 # S1B - or c, y3 # y3 = a|c # MAJA - mov f, y2 # y2 = f # CH - xor g, y2 # y2 = f^g # CH - - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - and e, y2 # y2 = (f^g)&e # CH - - rorx $14, e, y1 # y1 = (e >> 14) # S1 - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $39, a, y1 # y1 = a >> 39 # S0A - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - rorx $28, a, T1 # T1 = (a >> 28) # S0 - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3, h # h = t1 + S0 + MAJ # -- - - RotateState - -################################### RND N + 3 ######################################### - - vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} - vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} - vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} - vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} - vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} - vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} - vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} - vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ - # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} - - # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] - # to newly calculated sigma1 to get w[18] and w[19] - vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} - - # Form w[19, w[18], w17], w[16] - vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} - - mov a, y3 # y3 = a # MAJA - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - mov f, y2 # y2 = f # CH - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - xor g, y2 # y2 = f^g # CH - - - rorx $14, e, y1 # y1 = (e >> 14) # S1 - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - rorx $39, a, y1 # y1 = a >> 39 # S0A - add y0, y2 # y2 = S1 + CH # -- - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - rorx $28, a, T1 # T1 = (a >> 28) # S0 - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - - add y1, h # h = k + w + h + S0 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - RotateState - - rotate_Ys -.endm - -.macro DO_4ROUNDS - -################################### RND N + 0 ######################################### - - mov f, y2 # y2 = f # CH - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - rorx $14, e, y1 # y1 = (e >> 14) # S1 - and e, y2 # y2 = (f^g)&e # 
CH - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $39, a, y1 # y1 = a >> 39 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - rorx $28, a, T1 # T1 = (a >> 28) # S0 - add frame_XFER(%rsp), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - RotateState - -################################### RND N + 1 ######################################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - rorx $14, e, y1 # y1 = (e >> 14) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $39, a, y1 # y1 = a >> 39 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - rorx $28, a, T1 # T1 = (a >> 28) # S0 - add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - RotateState - -################################### RND N + 2 ######################################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - rorx $14, e, y1 # y1 = (e >> 14) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $39, a, y1 # y1 = a >> 39 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - rorx $28, a, T1 # T1 = (a >> 28) # S0 - add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - RotateState - -################################### RND N + 3 ######################################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - rorx $14, e, y1 # y1 = (e >> 14) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ 
# -- - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $39, a, y1 # y1 = a >> 39 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - rorx $28, a, T1 # T1 = (a >> 28) # S0 - add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3, h # h = t1 + S0 + MAJ # -- - - RotateState - -.endm - -######################################################################## -# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks) -# Purpose: Updates the SHA512 digest stored at "state" with the message -# stored in "data". -# The size of the message pointed to by "data" must be an integer multiple -# of SHA512 message blocks. -# "blocks" is the message length in SHA512 blocks -######################################################################## -SYM_TYPED_FUNC_START(sha512_transform_rorx) - # Save GPRs - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - # Allocate Stack Space - push %rbp - mov %rsp, %rbp - sub $frame_size, %rsp - and $~(0x20 - 1), %rsp - - shl $7, NUM_BLKS # convert to bytes - jz .Ldone_hash - add INP, NUM_BLKS # pointer to end of data - mov NUM_BLKS, frame_INPEND(%rsp) - - ## load initial digest - mov 8*0(CTX1), a - mov 8*1(CTX1), b - mov 8*2(CTX1), c - mov 8*3(CTX1), d - mov 8*4(CTX1), e - mov 8*5(CTX1), f - mov 8*6(CTX1), g - mov 8*7(CTX1), h - - # save %rdi (CTX) before it gets clobbered - mov %rdi, frame_CTX(%rsp) - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - -.Lloop0: - lea K512(%rip), TBL - - ## byte swap first 16 dwords - COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK - - mov INP, frame_INP(%rsp) - - ## schedule 64 input dwords, by doing 12 rounds of 4 each - movq $4, frame_SRND(%rsp) - -.align 16 -.Lloop1: - vpaddq (TBL), Y_0, XFER - vmovdqa XFER, frame_XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddq 1*32(TBL), Y_0, XFER - vmovdqa XFER, frame_XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddq 2*32(TBL), Y_0, XFER - vmovdqa XFER, frame_XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddq 3*32(TBL), Y_0, XFER - vmovdqa XFER, frame_XFER(%rsp) - add $(4*32), TBL - FOUR_ROUNDS_AND_SCHED - - subq $1, frame_SRND(%rsp) - jne .Lloop1 - - movq $2, frame_SRND(%rsp) -.Lloop2: - vpaddq (TBL), Y_0, XFER - vmovdqa XFER, frame_XFER(%rsp) - DO_4ROUNDS - vpaddq 1*32(TBL), Y_1, XFER - vmovdqa XFER, frame_XFER(%rsp) - add $(2*32), TBL - DO_4ROUNDS - - vmovdqa Y_2, Y_0 - vmovdqa Y_3, Y_1 - - subq $1, frame_SRND(%rsp) - jne .Lloop2 - - mov frame_CTX(%rsp), CTX2 - addm 8*0(CTX2), a - addm 8*1(CTX2), b - addm 8*2(CTX2), c - addm 8*3(CTX2), d - addm 8*4(CTX2), e - addm 8*5(CTX2), f - addm 8*6(CTX2), g - addm 8*7(CTX2), h - - mov frame_INP(%rsp), INP - add $128, INP - cmp frame_INPEND(%rsp), INP - jne .Lloop0 - -.Ldone_hash: - - # Restore Stack Pointer - mov %rbp, %rsp - pop %rbp - - # Restore GPRs - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - - 
vzeroupper - RET -SYM_FUNC_END(sha512_transform_rorx) - -######################################################################## -### Binary Data - - -# Mergeable 640-byte rodata section. This allows linker to merge the table -# with other, exactly the same 640-byte fragment of another rodata section -# (if such section exists). -.section .rodata.cst640.K512, "aM", @progbits, 640 -.align 64 -# K[t] used in SHA512 hashing -K512: - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 - -.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 -.align 32 -# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x08090a0b0c0d0e0f0001020304050607 - .octa 0x18191a1b1c1d1e1f1011121314151617 - -.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32 -.align 32 -MASK_YMM_LO: - .octa 0x00000000000000000000000000000000 - .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S deleted file mode 100644 index 30a2c4777f9d..000000000000 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ /dev/null @@ -1,425 +0,0 @@ -######################################################################## -# Implement fast SHA-512 with SSSE3 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford -# Kirk Yap -# David Cote -# Tim Chen -# -# This software is available to you under a choice of one of two -# licenses. 
You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-512 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## - -#include -#include - -.text - -# Virtual Registers -# ARG1 -digest = %rdi -# ARG2 -msg = %rsi -# ARG3 -msglen = %rdx -T1 = %rcx -T2 = %r8 -a_64 = %r9 -b_64 = %r10 -c_64 = %r11 -d_64 = %r12 -e_64 = %r13 -f_64 = %r14 -g_64 = %r15 -h_64 = %rbx -tmp0 = %rax - -# Local variables (stack frame) - -W_SIZE = 80*8 -WK_SIZE = 2*8 - -frame_W = 0 -frame_WK = frame_W + W_SIZE -frame_size = frame_WK + WK_SIZE - -# Useful QWORD "arrays" for simpler memory references -# MSG, DIGEST, K_t, W_t are arrays -# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even - -# Input message (arg1) -#define MSG(i) 8*i(msg) - -# Output Digest (arg2) -#define DIGEST(i) 8*i(digest) - -# SHA Constants (static mem) -#define K_t(i) 8*i+K512(%rip) - -# Message Schedule (stack frame) -#define W_t(i) 8*i+frame_W(%rsp) - -# W[t]+K[t] (stack frame) -#define WK_2(i) 8*((i%2))+frame_WK(%rsp) - -.macro RotateState - # Rotate symbols a..h right - TMP = h_64 - h_64 = g_64 - g_64 = f_64 - f_64 = e_64 - e_64 = d_64 - d_64 = c_64 - c_64 = b_64 - b_64 = a_64 - a_64 = TMP -.endm - -.macro SHA512_Round rnd - - # Compute Round %%t - mov f_64, T1 # T1 = f - mov e_64, tmp0 # tmp = e - xor g_64, T1 # T1 = f ^ g - ror $23, tmp0 # 41 # tmp = e ror 23 - and e_64, T1 # T1 = (f ^ g) & e - xor e_64, tmp0 # tmp = (e ror 23) ^ e - xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) - idx = \rnd - add WK_2(idx), T1 # W[t] + K[t] from message scheduler - ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4 - xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e - mov a_64, T2 # T2 = a - add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h - ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) - add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) - mov a_64, tmp0 # tmp = a - xor c_64, T2 # T2 = a ^ c - and c_64, tmp0 # tmp = a & c - and b_64, T2 # T2 = (a ^ c) & b - xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) - mov a_64, tmp0 # tmp = a - ror $5, tmp0 # 39 # tmp = a 
ror 5 - xor a_64, tmp0 # tmp = (a ror 5) ^ a - add T1, d_64 # e(next_state) = d + T1 - ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6 - xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a - lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) - ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) - add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) - RotateState -.endm - -.macro SHA512_2Sched_2Round_sse rnd - - # Compute rounds t-2 and t-1 - # Compute message schedule QWORDS t and t+1 - - # Two rounds are computed based on the values for K[t-2]+W[t-2] and - # K[t-1]+W[t-1] which were previously stored at WK_2 by the message - # scheduler. - # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. - # They are then added to their respective SHA512 constants at - # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] - # For brievity, the comments following vectored instructions only refer to - # the first of a pair of QWORDS. - # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} - # The computation of the message schedule and the rounds are tightly - # stitched to take advantage of instruction-level parallelism. - # For clarity, integer instructions (for the rounds calculation) are indented - # by one tab. Vectored instructions (for the message scheduler) are indented - # by two tabs. - - mov f_64, T1 - idx = \rnd -2 - movdqa W_t(idx), %xmm2 # XMM2 = W[t-2] - xor g_64, T1 - and e_64, T1 - movdqa %xmm2, %xmm0 # XMM0 = W[t-2] - xor g_64, T1 - idx = \rnd - add WK_2(idx), T1 - idx = \rnd - 15 - movdqu W_t(idx), %xmm5 # XMM5 = W[t-15] - mov e_64, tmp0 - ror $23, tmp0 # 41 - movdqa %xmm5, %xmm3 # XMM3 = W[t-15] - xor e_64, tmp0 - ror $4, tmp0 # 18 - psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42 - xor e_64, tmp0 - ror $14, tmp0 # 14 - psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1 - add tmp0, T1 - add h_64, T1 - pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2] - mov a_64, T2 - xor c_64, T2 - pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15] - and b_64, T2 - mov a_64, tmp0 - psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13 - and c_64, tmp0 - xor tmp0, T2 - psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6 - mov a_64, tmp0 - ror $5, tmp0 # 39 - pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] - xor a_64, tmp0 - ror $6, tmp0 # 34 - pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] - xor a_64, tmp0 - ror $28, tmp0 # 28 - psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 - add tmp0, T2 - add T1, d_64 - psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 - lea (T1, T2), h_64 - RotateState - movdqa %xmm2, %xmm1 # XMM1 = W[t-2] - mov f_64, T1 - xor g_64, T1 - movdqa %xmm5, %xmm4 # XMM4 = W[t-15] - and e_64, T1 - xor g_64, T1 - psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42 - idx = \rnd + 1 - add WK_2(idx), T1 - mov e_64, tmp0 - psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7 - ror $23, tmp0 # 41 - xor e_64, tmp0 - pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2] - ror $4, tmp0 # 18 - xor e_64, tmp0 - pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15] - ror $14, tmp0 # 14 - add tmp0, T1 - psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3 - add h_64, T1 - mov a_64, T2 - psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56 - xor c_64, T2 - and b_64, T2 - pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2]) - mov a_64, tmp0 - and c_64, tmp0 - idx = \rnd - 7 - movdqu W_t(idx), %xmm1 # XMM1 = W[t-7] - xor tmp0, T2 - pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15]) - mov a_64, tmp0 - paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + 
s0(W[t-15]) - ror $5, tmp0 # 39 - idx =\rnd-16 - paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] - xor a_64, tmp0 - paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] - ror $6, tmp0 # 34 - movdqa %xmm0, W_t(\rnd) # Store scheduled qwords - xor a_64, tmp0 - paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t] - ror $28, tmp0 # 28 - idx = \rnd - movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds - add tmp0, T2 - add T1, d_64 - lea (T1, T2), h_64 - RotateState -.endm - -######################################################################## -## void sha512_transform_ssse3(struct sha512_state *state, const u8 *data, -## int blocks); -# (struct sha512_state is assumed to begin with u64 state[8]) -# Purpose: Updates the SHA512 digest stored at "state" with the message -# stored in "data". -# The size of the message pointed to by "data" must be an integer multiple -# of SHA512 message blocks. -# "blocks" is the message length in SHA512 blocks. -######################################################################## -SYM_TYPED_FUNC_START(sha512_transform_ssse3) - - test msglen, msglen - je .Lnowork - - # Save GPRs - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - # Allocate Stack Space - push %rbp - mov %rsp, %rbp - sub $frame_size, %rsp - and $~(0x20 - 1), %rsp - -.Lupdateblock: - -# Load state variables - mov DIGEST(0), a_64 - mov DIGEST(1), b_64 - mov DIGEST(2), c_64 - mov DIGEST(3), d_64 - mov DIGEST(4), e_64 - mov DIGEST(5), f_64 - mov DIGEST(6), g_64 - mov DIGEST(7), h_64 - - t = 0 - .rept 80/2 + 1 - # (80 rounds) / (2 rounds/iteration) + (1 iteration) - # +1 iteration because the scheduler leads hashing by 1 iteration - .if t < 2 - # BSWAP 2 QWORDS - movdqa XMM_QWORD_BSWAP(%rip), %xmm1 - movdqu MSG(t), %xmm0 - pshufb %xmm1, %xmm0 # BSWAP - movdqa %xmm0, W_t(t) # Store Scheduled Pair - paddq K_t(t), %xmm0 # Compute W[t]+K[t] - movdqa %xmm0, WK_2(t) # Store into WK for rounds - .elseif t < 16 - # BSWAP 2 QWORDS# Compute 2 Rounds - movdqu MSG(t), %xmm0 - pshufb %xmm1, %xmm0 # BSWAP - SHA512_Round t-2 # Round t-2 - movdqa %xmm0, W_t(t) # Store Scheduled Pair - paddq K_t(t), %xmm0 # Compute W[t]+K[t] - SHA512_Round t-1 # Round t-1 - movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK - .elseif t < 79 - # Schedule 2 QWORDS# Compute 2 Rounds - SHA512_2Sched_2Round_sse t - .else - # Compute 2 Rounds - SHA512_Round t-2 - SHA512_Round t-1 - .endif - t = t+2 - .endr - - # Update digest - add a_64, DIGEST(0) - add b_64, DIGEST(1) - add c_64, DIGEST(2) - add d_64, DIGEST(3) - add e_64, DIGEST(4) - add f_64, DIGEST(5) - add g_64, DIGEST(6) - add h_64, DIGEST(7) - - # Advance to next message block - add $16*8, msg - dec msglen - jnz .Lupdateblock - - # Restore Stack Pointer - mov %rbp, %rsp - pop %rbp - - # Restore GPRs - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - -.Lnowork: - RET -SYM_FUNC_END(sha512_transform_ssse3) - -######################################################################## -### Binary Data - -.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 -.align 16 -# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. -XMM_QWORD_BSWAP: - .octa 0x08090a0b0c0d0e0f0001020304050607 - -# Mergeable 640-byte rodata section. This allows linker to merge the table -# with other, exactly the same 640-byte fragment of another rodata section -# (if such section exists). 
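Both the AVX and SSSE3 schedulers above expand the sixteen message qwords into the eighty-entry schedule two entries per iteration, interleaved with two rounds of the compression function. As a plain-C cross-check of what the shift/xor sequences compute, here is a sketch of the same recurrence (illustrative names only, not the kernel's):

/* Scalar reference for the message-schedule recurrence computed two
 * qwords per iteration by the vector code above:
 *   W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
 * Illustrative sketch; names are not taken from the kernel sources.
 */
#include <stdint.h>

static inline uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

static inline uint64_t s0(uint64_t w)	/* small sigma0 */
{
	return ror64(w, 1) ^ ror64(w, 8) ^ (w >> 7);
}

static inline uint64_t s1(uint64_t w)	/* small sigma1 */
{
	return ror64(w, 19) ^ ror64(w, 61) ^ (w >> 6);
}

/* W[0..15] already hold the byte-swapped message block. */
static void sha512_schedule(uint64_t W[80])
{
	for (int t = 16; t < 80; t += 2) {
		W[t]     = s1(W[t - 2]) + W[t - 7] + s0(W[t - 15]) + W[t - 16];
		W[t + 1] = s1(W[t - 1]) + W[t - 6] + s0(W[t - 14]) + W[t - 15];
	}
}

The vector code produces W[t] and W[t+1] in one register and immediately adds K[t] and K[t+1], so the rounds that consume them two iterations later only need the single memory-operand add from WK_2.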
-.section .rodata.cst640.K512, "aM", @progbits, 640 -.align 64 -# K[t] used in SHA512 hashing -K512: - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c deleted file mode 100644 index 97744b7d2381..000000000000 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Cryptographic API. - * - * Glue code for the SHA512 Secure Hash Algorithm assembler - * implementation using supplemental SSE3 / AVX / AVX2 instructions. - * - * This file is based on sha512_generic.c - * - * Copyright (C) 2013 Intel Corporation - * Author: Tim Chen - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void sha512_transform_ssse3(struct sha512_state *state, - const u8 *data, int blocks); - -static int sha512_update_x86(struct shash_desc *desc, const u8 *data, - unsigned int len, sha512_block_fn *sha512_xform) -{ - int remain; - - /* - * Make sure struct sha512_state begins directly with the SHA512 - * 512-bit internal state, as this is what the asm functions expect. - */ - BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); - - kernel_fpu_begin(); - remain = sha512_base_do_update_blocks(desc, data, len, sha512_xform); - kernel_fpu_end(); - - return remain; -} - -static int sha512_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha512_block_fn *sha512_xform) -{ - kernel_fpu_begin(); - sha512_base_do_finup(desc, data, len, sha512_xform); - kernel_fpu_end(); - - return sha512_base_finish(desc, out); -} - -static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_update_x86(desc, data, len, sha512_transform_ssse3); -} - -static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha512_finup(desc, data, len, out, sha512_transform_ssse3); -} - -static struct shash_alg sha512_ssse3_algs[] = { { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_base_init, - .update = sha512_ssse3_update, - .finup = sha512_ssse3_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name = "sha512-ssse3", - .cra_priority = 150, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_base_init, - .update = sha512_ssse3_update, - .finup = sha512_ssse3_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name = "sha384-ssse3", - .cra_priority = 150, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int register_sha512_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - return crypto_register_shashes(sha512_ssse3_algs, - ARRAY_SIZE(sha512_ssse3_algs)); - return 0; -} - -static void unregister_sha512_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shashes(sha512_ssse3_algs, - ARRAY_SIZE(sha512_ssse3_algs)); -} - -asmlinkage void sha512_transform_avx(struct sha512_state *state, - const u8 *data, int blocks); -static bool avx_usable(void) -{ - if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (boot_cpu_has(X86_FEATURE_AVX)) - pr_info("AVX detected but unusable.\n"); - return false; - } - - return true; -} - -static int sha512_avx_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_update_x86(desc, data, len, sha512_transform_avx); -} - -static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha512_finup(desc, data, len, out, sha512_transform_avx); -} - -static struct shash_alg sha512_avx_algs[] = { { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_base_init, - .update = sha512_avx_update, - .finup = sha512_avx_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name = "sha512-avx", - .cra_priority = 160, - .cra_flags = 
CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_base_init, - .update = sha512_avx_update, - .finup = sha512_avx_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name = "sha384-avx", - .cra_priority = 160, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int register_sha512_avx(void) -{ - if (avx_usable()) - return crypto_register_shashes(sha512_avx_algs, - ARRAY_SIZE(sha512_avx_algs)); - return 0; -} - -static void unregister_sha512_avx(void) -{ - if (avx_usable()) - crypto_unregister_shashes(sha512_avx_algs, - ARRAY_SIZE(sha512_avx_algs)); -} - -asmlinkage void sha512_transform_rorx(struct sha512_state *state, - const u8 *data, int blocks); - -static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_update_x86(desc, data, len, sha512_transform_rorx); -} - -static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha512_finup(desc, data, len, out, sha512_transform_rorx); -} - -static struct shash_alg sha512_avx2_algs[] = { { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_base_init, - .update = sha512_avx2_update, - .finup = sha512_avx2_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name = "sha512-avx2", - .cra_priority = 170, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_base_init, - .update = sha512_avx2_update, - .finup = sha512_avx2_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name = "sha384-avx2", - .cra_priority = 170, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static bool avx2_usable(void) -{ - if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_BMI2)) - return true; - - return false; -} - -static int register_sha512_avx2(void) -{ - if (avx2_usable()) - return crypto_register_shashes(sha512_avx2_algs, - ARRAY_SIZE(sha512_avx2_algs)); - return 0; -} -static const struct x86_cpu_id module_cpu_ids[] = { - X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), - X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); - -static void unregister_sha512_avx2(void) -{ - if (avx2_usable()) - crypto_unregister_shashes(sha512_avx2_algs, - ARRAY_SIZE(sha512_avx2_algs)); -} - -static int __init sha512_ssse3_mod_init(void) -{ - if (!x86_match_cpu(module_cpu_ids)) - return -ENODEV; - - if (register_sha512_ssse3()) - goto fail; - - if (register_sha512_avx()) { - unregister_sha512_ssse3(); - goto fail; - } - - if (register_sha512_avx2()) { - unregister_sha512_avx(); - unregister_sha512_ssse3(); - goto fail; - } - - return 0; -fail: - return -ENODEV; -} - -static void __exit sha512_ssse3_mod_fini(void) -{ - unregister_sha512_avx2(); - unregister_sha512_avx(); - unregister_sha512_ssse3(); -} - -module_init(sha512_ssse3_mod_init); -module_exit(sha512_ssse3_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, 
Supplemental SSE3 accelerated"); - -MODULE_ALIAS_CRYPTO("sha512"); -MODULE_ALIAS_CRYPTO("sha512-ssse3"); -MODULE_ALIAS_CRYPTO("sha512-avx"); -MODULE_ALIAS_CRYPTO("sha512-avx2"); -MODULE_ALIAS_CRYPTO("sha384"); -MODULE_ALIAS_CRYPTO("sha384-ssse3"); -MODULE_ALIAS_CRYPTO("sha384-avx"); -MODULE_ALIAS_CRYPTO("sha384-avx2"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index c3edb78484f0..dce127a69f13 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -183,6 +183,7 @@ config CRYPTO_LIB_SHA512_ARCH default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO default y if S390 default y if SPARC64 + default y if X86_64 config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 0fb93f2469b3..aaf445a85384 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -95,6 +95,9 @@ endif libsha512-$(CONFIG_RISCV) += riscv/sha512-riscv64-zvknhb-zvkb.o libsha512-$(CONFIG_SPARC) += sparc/sha512_asm.o +libsha512-$(CONFIG_X86) += x86/sha512-ssse3-asm.o \ + x86/sha512-avx-asm.o \ + x86/sha512-avx2-asm.o endif # CONFIG_CRYPTO_LIB_SHA512_ARCH obj-$(CONFIG_MPILIB) += mpi/ diff --git a/lib/crypto/x86/sha512-avx-asm.S b/lib/crypto/x86/sha512-avx-asm.S new file mode 100644 index 000000000000..0b5f69179d62 --- /dev/null +++ b/lib/crypto/x86/sha512-avx-asm.S @@ -0,0 +1,424 @@ +######################################################################## +# Implement fast SHA-512 with AVX instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford +# Kirk Yap +# David Cote +# Tim Chen +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. 
+# +######################################################################## + +#include + +.text + +# Virtual Registers +# ARG1 +digest = %rdi +# ARG2 +msg = %rsi +# ARG3 +msglen = %rdx +T1 = %rcx +T2 = %r8 +a_64 = %r9 +b_64 = %r10 +c_64 = %r11 +d_64 = %r12 +e_64 = %r13 +f_64 = %r14 +g_64 = %r15 +h_64 = %rbx +tmp0 = %rax + +# Local variables (stack frame) + +# Message Schedule +W_SIZE = 80*8 +# W[t] + K[t] | W[t+1] + K[t+1] +WK_SIZE = 2*8 + +frame_W = 0 +frame_WK = frame_W + W_SIZE +frame_size = frame_WK + WK_SIZE + +# Useful QWORD "arrays" for simpler memory references +# MSG, DIGEST, K_t, W_t are arrays +# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even + +# Input message (arg1) +#define MSG(i) 8*i(msg) + +# Output Digest (arg2) +#define DIGEST(i) 8*i(digest) + +# SHA Constants (static mem) +#define K_t(i) 8*i+K512(%rip) + +# Message Schedule (stack frame) +#define W_t(i) 8*i+frame_W(%rsp) + +# W[t]+K[t] (stack frame) +#define WK_2(i) 8*((i%2))+frame_WK(%rsp) + +.macro RotateState + # Rotate symbols a..h right + TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = TMP +.endm + +.macro RORQ p1 p2 + # shld is faster than ror on Sandybridge + shld $(64-\p2), \p1, \p1 +.endm + +.macro SHA512_Round rnd + # Compute Round %%t + mov f_64, T1 # T1 = f + mov e_64, tmp0 # tmp = e + xor g_64, T1 # T1 = f ^ g + RORQ tmp0, 23 # 41 # tmp = e ror 23 + and e_64, T1 # T1 = (f ^ g) & e + xor e_64, tmp0 # tmp = (e ror 23) ^ e + xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + idx = \rnd + add WK_2(idx), T1 # W[t] + K[t] from message scheduler + RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4 + xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e + mov a_64, T2 # T2 = a + add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h + RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov a_64, tmp0 # tmp = a + xor c_64, T2 # T2 = a ^ c + and c_64, tmp0 # tmp = a & c + and b_64, T2 # T2 = (a ^ c) & b + xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov a_64, tmp0 # tmp = a + RORQ tmp0, 5 # 39 # tmp = a ror 5 + xor a_64, tmp0 # tmp = (a ror 5) ^ a + add T1, d_64 # e(next_state) = d + T1 + RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6 + xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a + lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) + RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +.endm + +.macro SHA512_2Sched_2Round_avx rnd + # Compute rounds t-2 and t-1 + # Compute message schedule QWORDS t and t+1 + + # Two rounds are computed based on the values for K[t-2]+W[t-2] and + # K[t-1]+W[t-1] which were previously stored at WK_2 by the message + # scheduler. + # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)]. + # They are then added to their respective SHA512 constants at + # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)] + # For brievity, the comments following vectored instructions only refer to + # the first of a pair of QWORDS. + # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} + # The computation of the message schedule and the rounds are tightly + # stitched to take advantage of instruction-level parallelism. 
+ + idx = \rnd - 2 + vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2] + idx = \rnd - 15 + vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15] + mov f_64, T1 + vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61 + mov e_64, tmp0 + vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1 + xor g_64, T1 + RORQ tmp0, 23 # 41 + vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19 + and e_64, T1 + xor e_64, tmp0 + vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 + xor g_64, T1 + idx = \rnd + add WK_2(idx), T1# + vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8 + RORQ tmp0, 4 # 18 + vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6 + xor e_64, tmp0 + mov a_64, T2 + add h_64, T1 + vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 + RORQ tmp0, 14 # 14 + add tmp0, T1 + vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7 + mov a_64, tmp0 + xor c_64, T2 + vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3 + and c_64, tmp0 + and b_64, T2 + vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3 + xor tmp0, T2 + mov a_64, tmp0 + vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63 + RORQ tmp0, 5 # 39 + vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63 + xor a_64, tmp0 + add T1, d_64 + RORQ tmp0, 6 # 34 + xor a_64, tmp0 + vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ + # W[t-15]>>7 ^ W[t-15]<<63 + lea (T1, T2), h_64 + RORQ tmp0, 28 # 28 + vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25 + add tmp0, h_64 + RotateState + vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ + # W[t-2]<<25 + mov f_64, T1 + vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + mov e_64, tmp0 + xor g_64, T1 + idx = \rnd - 16 + vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + idx = \rnd - 7 + vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7] + RORQ tmp0, 23 # 41 + and e_64, T1 + xor e_64, tmp0 + xor g_64, T1 + vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56 + idx = \rnd + 1 + add WK_2(idx), T1 + vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15]) + RORQ tmp0, 4 # 18 + vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) + xor e_64, tmp0 + vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] + + # s0(W[t-15]) + W[t-16] + mov a_64, T2 + add h_64, T1 + RORQ tmp0, 14 # 14 + add tmp0, T1 + idx = \rnd + vmovdqa %xmm0, W_t(idx) # Store W[t] + vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t] + vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds + mov a_64, tmp0 + xor c_64, T2 + and c_64, tmp0 + and b_64, T2 + xor tmp0, T2 + mov a_64, tmp0 + RORQ tmp0, 5 # 39 + xor a_64, tmp0 + add T1, d_64 + RORQ tmp0, 6 # 34 + xor a_64, tmp0 + lea (T1, T2), h_64 + RORQ tmp0, 28 # 28 + add tmp0, h_64 + RotateState +.endm + +######################################################################## +# void sha512_transform_avx(struct sha512_block_state *state, +# const u8 *data, size_t nblocks); +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. 
+# "nblocks" is the message length in SHA512 blocks +######################################################################## +SYM_FUNC_START(sha512_transform_avx) + + test msglen, msglen + je .Lnowork + + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + +.Lupdateblock: + + # Load state variables + mov DIGEST(0), a_64 + mov DIGEST(1), b_64 + mov DIGEST(2), c_64 + mov DIGEST(3), d_64 + mov DIGEST(4), e_64 + mov DIGEST(5), f_64 + mov DIGEST(6), g_64 + mov DIGEST(7), h_64 + + t = 0 + .rept 80/2 + 1 + # (80 rounds) / (2 rounds/iteration) + (1 iteration) + # +1 iteration because the scheduler leads hashing by 1 iteration + .if t < 2 + # BSWAP 2 QWORDS + vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1 + vmovdqu MSG(t), %xmm0 + vpshufb %xmm1, %xmm0, %xmm0 # BSWAP + vmovdqa %xmm0, W_t(t) # Store Scheduled Pair + vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] + vmovdqa %xmm0, WK_2(t) # Store into WK for rounds + .elseif t < 16 + # BSWAP 2 QWORDS# Compute 2 Rounds + vmovdqu MSG(t), %xmm0 + vpshufb %xmm1, %xmm0, %xmm0 # BSWAP + SHA512_Round t-2 # Round t-2 + vmovdqa %xmm0, W_t(t) # Store Scheduled Pair + vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] + SHA512_Round t-1 # Round t-1 + vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK + .elseif t < 79 + # Schedule 2 QWORDS# Compute 2 Rounds + SHA512_2Sched_2Round_avx t + .else + # Compute 2 Rounds + SHA512_Round t-2 + SHA512_Round t-1 + .endif + t = t+2 + .endr + + # Update digest + add a_64, DIGEST(0) + add b_64, DIGEST(1) + add c_64, DIGEST(2) + add d_64, DIGEST(3) + add e_64, DIGEST(4) + add f_64, DIGEST(5) + add g_64, DIGEST(6) + add h_64, DIGEST(7) + + # Advance to next message block + add $16*8, msg + dec msglen + jnz .Lupdateblock + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + +.Lnowork: + RET +SYM_FUNC_END(sha512_transform_avx) + +######################################################################## +### Binary Data + +.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 +.align 16 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +XMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). 
+.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/lib/crypto/x86/sha512-avx2-asm.S b/lib/crypto/x86/sha512-avx2-asm.S new file mode 100644 index 000000000000..2309c01e316b --- /dev/null +++ b/lib/crypto/x86/sha512-avx2-asm.S @@ -0,0 +1,751 @@ +######################################################################## +# Implement fast SHA-512 with AVX2 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford +# Kirk Yap +# David Cote +# Tim Chen +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## +# This code schedules 1 blocks at a time, with 4 lanes per block +######################################################################## + +#include + +.text + +# Virtual Registers +Y_0 = %ymm4 +Y_1 = %ymm5 +Y_2 = %ymm6 +Y_3 = %ymm7 + +YTMP0 = %ymm0 +YTMP1 = %ymm1 +YTMP2 = %ymm2 +YTMP3 = %ymm3 +YTMP4 = %ymm8 +XFER = YTMP0 + +BYTE_FLIP_MASK = %ymm9 + +# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 +CTX1 = %rdi +CTX2 = %r12 +# 2nd arg +INP = %rsi +# 3rd arg +NUM_BLKS = %rdx + +c = %rcx +d = %r8 +e = %rdx +y3 = %rsi + +TBL = %rdi # clobbers CTX1 + +a = %rax +b = %rbx + +f = %r9 +g = %r10 +h = %r11 +old_h = %r11 + +T1 = %r12 # clobbers CTX2 +y0 = %r13 +y1 = %r14 +y2 = %r15 + +# Local variables (stack frame) +XFER_SIZE = 4*8 +SRND_SIZE = 1*8 +INP_SIZE = 1*8 +INPEND_SIZE = 1*8 +CTX_SIZE = 1*8 + +frame_XFER = 0 +frame_SRND = frame_XFER + XFER_SIZE +frame_INP = frame_SRND + SRND_SIZE +frame_INPEND = frame_INP + INP_SIZE +frame_CTX = frame_INPEND + INPEND_SIZE +frame_size = frame_CTX + CTX_SIZE + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + + +# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask +# Load ymm with mem and byte swap each dword +.macro COPY_YMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p2, \p1 + vpshufb \p3, \p1, \p1 +.endm +# rotate_Ys +# Rotate values of symbols Y0...Y3 +.macro rotate_Ys + Y_ = Y_0 + Y_0 = Y_1 + Y_1 = Y_2 + Y_2 = Y_3 + Y_3 = Y_ +.endm + +# RotateState +.macro RotateState + # Rotate symbols a..h right + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL +# YDST = {YSRC1, YSRC2} >> RVAL*8 +.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL + vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI} + vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 +.endm + +.macro FOUR_ROUNDS_AND_SCHED +################################### RND N + 0 ######################################### + + # Extract w[t-7] + MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] + # Calculate w[t-16] + w[t-7] + vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] + # Extract w[t-15] + MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] + + # Calculate sigma0 + + # Calculate w[t-15] ror 1 + vpsrlq $1, YTMP1, YTMP2 + vpsllq $(64-1), YTMP1, YTMP3 + vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 + # Calculate w[t-15] shr 7 + vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add frame_XFER(%rsp),h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + rorx $14, e, y1 # y1 = (e >> 14) # S1 + + and e, 
y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 1 ######################################### + + # Calculate w[t-15] ror 8 + vpsrlq $8, YTMP1, YTMP2 + vpsllq $(64-8), YTMP1, YTMP1 + vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 + # XOR the three components + vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 + vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 + + + # Add three components, w[t-16], w[t-7] and sigma0 + vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 + # Move to appropriate lanes for calculating w[16] and w[17] + vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} + # Move to appropriate lanes for calculating w[18] and w[19] + vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} + + # Calculate w[16] and w[17] in both 128 bit lanes + + # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes + vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} + vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} + + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + +################################### RND N + 2 ######################################### + + vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} + vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} + vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} + vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} + + # Add sigma1 to the other compunents to get w[16] and w[17] + vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} + + # Calculate sigma1 
for w[18] and w[19] for upper 128 bit lane + vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- + + rorx $18, e, y1 # y1 = e >> 18 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + and e, y2 # y2 = (f^g)&e # CH + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 3 ######################################### + + vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} + vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} + vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} + vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} + + # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] + # to newly calculated sigma1 to get w[18] and w[19] + vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} + + # Form w[19, w[18], w17], w[16] + vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + rorx $39, a, y1 # y1 = a >> 39 # S0A + add y0, y2 # y2 = S1 + CH # -- + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + rotate_Ys +.endm + +.macro DO_4ROUNDS + +################################### RND N + 0 ######################################### + + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + + 
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 1 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 2 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 3 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + 
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +.endm + +######################################################################## +# void sha512_transform_rorx(struct sha512_block_state *state, +# const u8 *data, size_t nblocks); +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "nblocks" is the message length in SHA512 blocks +######################################################################## +SYM_FUNC_START(sha512_transform_rorx) + + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + + shl $7, NUM_BLKS # convert to bytes + jz .Ldone_hash + add INP, NUM_BLKS # pointer to end of data + mov NUM_BLKS, frame_INPEND(%rsp) + + ## load initial digest + mov 8*0(CTX1), a + mov 8*1(CTX1), b + mov 8*2(CTX1), c + mov 8*3(CTX1), d + mov 8*4(CTX1), e + mov 8*5(CTX1), f + mov 8*6(CTX1), g + mov 8*7(CTX1), h + + # save %rdi (CTX) before it gets clobbered + mov %rdi, frame_CTX(%rsp) + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + +.Lloop0: + lea K512(%rip), TBL + + ## byte swap first 16 dwords + COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK + + mov INP, frame_INP(%rsp) + + ## schedule 64 input dwords, by doing 12 rounds of 4 each + movq $4, frame_SRND(%rsp) + +.align 16 +.Lloop1: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 1*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 2*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 3*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(4*32), TBL + FOUR_ROUNDS_AND_SCHED + + subq $1, frame_SRND(%rsp) + jne .Lloop1 + + movq $2, frame_SRND(%rsp) +.Lloop2: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + DO_4ROUNDS + vpaddq 1*32(TBL), Y_1, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(2*32), TBL + DO_4ROUNDS + + vmovdqa Y_2, Y_0 + vmovdqa Y_3, Y_1 + + subq $1, frame_SRND(%rsp) + jne .Lloop2 + + mov frame_CTX(%rsp), CTX2 + addm 8*0(CTX2), a + addm 8*1(CTX2), b + addm 8*2(CTX2), c + addm 8*3(CTX2), d + addm 8*4(CTX2), e + addm 8*5(CTX2), f + addm 8*6(CTX2), g + addm 8*7(CTX2), h + + mov frame_INP(%rsp), INP + add $128, INP + cmp frame_INPEND(%rsp), INP + jne .Lloop0 + +.Ldone_hash: + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + 
pop %rbx + + vzeroupper + RET +SYM_FUNC_END(sha512_transform_rorx) + +######################################################################## +### Binary Data + + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). +.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x08090a0b0c0d0e0f0001020304050607 + .octa 0x18191a1b1c1d1e1f1011121314151617 + +.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32 +.align 32 +MASK_YMM_LO: + .octa 0x00000000000000000000000000000000 + .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha512-ssse3-asm.S b/lib/crypto/x86/sha512-ssse3-asm.S new file mode 100644 index 000000000000..12e78142f2e3 --- /dev/null +++ b/lib/crypto/x86/sha512-ssse3-asm.S @@ -0,0 +1,423 @@ +######################################################################## +# Implement fast SHA-512 with SSSE3 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford +# Kirk Yap +# David Cote +# Tim Chen +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#include + +.text + +# Virtual Registers +# ARG1 +digest = %rdi +# ARG2 +msg = %rsi +# ARG3 +msglen = %rdx +T1 = %rcx +T2 = %r8 +a_64 = %r9 +b_64 = %r10 +c_64 = %r11 +d_64 = %r12 +e_64 = %r13 +f_64 = %r14 +g_64 = %r15 +h_64 = %rbx +tmp0 = %rax + +# Local variables (stack frame) + +W_SIZE = 80*8 +WK_SIZE = 2*8 + +frame_W = 0 +frame_WK = frame_W + W_SIZE +frame_size = frame_WK + WK_SIZE + +# Useful QWORD "arrays" for simpler memory references +# MSG, DIGEST, K_t, W_t are arrays +# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even + +# Input message (arg1) +#define MSG(i) 8*i(msg) + +# Output Digest (arg2) +#define DIGEST(i) 8*i(digest) + +# SHA Constants (static mem) +#define K_t(i) 8*i+K512(%rip) + +# Message Schedule (stack frame) +#define W_t(i) 8*i+frame_W(%rsp) + +# W[t]+K[t] (stack frame) +#define WK_2(i) 8*((i%2))+frame_WK(%rsp) + +.macro RotateState + # Rotate symbols a..h right + TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = TMP +.endm + +.macro SHA512_Round rnd + + # Compute Round %%t + mov f_64, T1 # T1 = f + mov e_64, tmp0 # tmp = e + xor g_64, T1 # T1 = f ^ g + ror $23, tmp0 # 41 # tmp = e ror 23 + and e_64, T1 # T1 = (f ^ g) & e + xor e_64, tmp0 # tmp = (e ror 23) ^ e + xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + idx = \rnd + add WK_2(idx), T1 # W[t] + K[t] from message scheduler + ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4 + xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e + mov a_64, T2 # T2 = a + add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h + ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov a_64, tmp0 # tmp = a + xor c_64, T2 # T2 = a ^ c + and c_64, tmp0 # tmp = a & c + and b_64, T2 # T2 = (a ^ c) & b + xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov a_64, tmp0 # tmp = a + ror $5, tmp0 # 39 # tmp = a ror 5 + 
xor a_64, tmp0 # tmp = (a ror 5) ^ a + add T1, d_64 # e(next_state) = d + T1 + ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6 + xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a + lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) + ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +.endm + +.macro SHA512_2Sched_2Round_sse rnd + + # Compute rounds t-2 and t-1 + # Compute message schedule QWORDS t and t+1 + + # Two rounds are computed based on the values for K[t-2]+W[t-2] and + # K[t-1]+W[t-1] which were previously stored at WK_2 by the message + # scheduler. + # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + # They are then added to their respective SHA512 constants at + # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + # For brievity, the comments following vectored instructions only refer to + # the first of a pair of QWORDS. + # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} + # The computation of the message schedule and the rounds are tightly + # stitched to take advantage of instruction-level parallelism. + # For clarity, integer instructions (for the rounds calculation) are indented + # by one tab. Vectored instructions (for the message scheduler) are indented + # by two tabs. + + mov f_64, T1 + idx = \rnd -2 + movdqa W_t(idx), %xmm2 # XMM2 = W[t-2] + xor g_64, T1 + and e_64, T1 + movdqa %xmm2, %xmm0 # XMM0 = W[t-2] + xor g_64, T1 + idx = \rnd + add WK_2(idx), T1 + idx = \rnd - 15 + movdqu W_t(idx), %xmm5 # XMM5 = W[t-15] + mov e_64, tmp0 + ror $23, tmp0 # 41 + movdqa %xmm5, %xmm3 # XMM3 = W[t-15] + xor e_64, tmp0 + ror $4, tmp0 # 18 + psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42 + xor e_64, tmp0 + ror $14, tmp0 # 14 + psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1 + add tmp0, T1 + add h_64, T1 + pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2] + mov a_64, T2 + xor c_64, T2 + pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15] + and b_64, T2 + mov a_64, tmp0 + psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13 + and c_64, tmp0 + xor tmp0, T2 + psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6 + mov a_64, tmp0 + ror $5, tmp0 # 39 + pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] + xor a_64, tmp0 + ror $6, tmp0 # 34 + pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] + xor a_64, tmp0 + ror $28, tmp0 # 28 + psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 + add tmp0, T2 + add T1, d_64 + psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 + lea (T1, T2), h_64 + RotateState + movdqa %xmm2, %xmm1 # XMM1 = W[t-2] + mov f_64, T1 + xor g_64, T1 + movdqa %xmm5, %xmm4 # XMM4 = W[t-15] + and e_64, T1 + xor g_64, T1 + psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42 + idx = \rnd + 1 + add WK_2(idx), T1 + mov e_64, tmp0 + psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7 + ror $23, tmp0 # 41 + xor e_64, tmp0 + pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2] + ror $4, tmp0 # 18 + xor e_64, tmp0 + pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15] + ror $14, tmp0 # 14 + add tmp0, T1 + psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3 + add h_64, T1 + mov a_64, T2 + psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56 + xor c_64, T2 + and b_64, T2 + pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + mov a_64, tmp0 + and c_64, tmp0 + idx = \rnd - 7 + movdqu W_t(idx), %xmm1 # XMM1 = W[t-7] + xor tmp0, T2 + pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15]) + mov a_64, tmp0 + paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + 
s0(W[t-15]) + ror $5, tmp0 # 39 + idx =\rnd-16 + paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] + xor a_64, tmp0 + paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] + ror $6, tmp0 # 34 + movdqa %xmm0, W_t(\rnd) # Store scheduled qwords + xor a_64, tmp0 + paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t] + ror $28, tmp0 # 28 + idx = \rnd + movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds + add tmp0, T2 + add T1, d_64 + lea (T1, T2), h_64 + RotateState +.endm + +######################################################################## +# void sha512_transform_ssse3(struct sha512_block_state *state, +# const u8 *data, size_t nblocks); +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "nblocks" is the message length in SHA512 blocks +######################################################################## +SYM_FUNC_START(sha512_transform_ssse3) + + test msglen, msglen + je .Lnowork + + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + +.Lupdateblock: + +# Load state variables + mov DIGEST(0), a_64 + mov DIGEST(1), b_64 + mov DIGEST(2), c_64 + mov DIGEST(3), d_64 + mov DIGEST(4), e_64 + mov DIGEST(5), f_64 + mov DIGEST(6), g_64 + mov DIGEST(7), h_64 + + t = 0 + .rept 80/2 + 1 + # (80 rounds) / (2 rounds/iteration) + (1 iteration) + # +1 iteration because the scheduler leads hashing by 1 iteration + .if t < 2 + # BSWAP 2 QWORDS + movdqa XMM_QWORD_BSWAP(%rip), %xmm1 + movdqu MSG(t), %xmm0 + pshufb %xmm1, %xmm0 # BSWAP + movdqa %xmm0, W_t(t) # Store Scheduled Pair + paddq K_t(t), %xmm0 # Compute W[t]+K[t] + movdqa %xmm0, WK_2(t) # Store into WK for rounds + .elseif t < 16 + # BSWAP 2 QWORDS# Compute 2 Rounds + movdqu MSG(t), %xmm0 + pshufb %xmm1, %xmm0 # BSWAP + SHA512_Round t-2 # Round t-2 + movdqa %xmm0, W_t(t) # Store Scheduled Pair + paddq K_t(t), %xmm0 # Compute W[t]+K[t] + SHA512_Round t-1 # Round t-1 + movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK + .elseif t < 79 + # Schedule 2 QWORDS# Compute 2 Rounds + SHA512_2Sched_2Round_sse t + .else + # Compute 2 Rounds + SHA512_Round t-2 + SHA512_Round t-1 + .endif + t = t+2 + .endr + + # Update digest + add a_64, DIGEST(0) + add b_64, DIGEST(1) + add c_64, DIGEST(2) + add d_64, DIGEST(3) + add e_64, DIGEST(4) + add f_64, DIGEST(5) + add g_64, DIGEST(6) + add h_64, DIGEST(7) + + # Advance to next message block + add $16*8, msg + dec msglen + jnz .Lupdateblock + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + +.Lnowork: + RET +SYM_FUNC_END(sha512_transform_ssse3) + +######################################################################## +### Binary Data + +.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 +.align 16 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +XMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). 
+.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/lib/crypto/x86/sha512.h b/lib/crypto/x86/sha512.h new file mode 100644 index 000000000000..c13503d9d57d --- /dev/null +++ b/lib/crypto/x86/sha512.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * x86-optimized SHA-512 block function + * + * Copyright 2025 Google LLC + */ + +#include +#include +#include + +DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic); + +#define DEFINE_X86_SHA512_FN(c_fn, asm_fn) \ + asmlinkage void asm_fn(struct sha512_block_state *state, \ + const u8 *data, size_t nblocks); \ + static void c_fn(struct sha512_block_state *state, const u8 *data, \ + size_t nblocks) \ + { \ + if (likely(crypto_simd_usable())) { \ + kernel_fpu_begin(); \ + asm_fn(state, data, nblocks); \ + kernel_fpu_end(); \ + } else { \ + sha512_blocks_generic(state, data, nblocks); \ + } \ + } + +DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3); +DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx); +DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + static_call(sha512_blocks_x86)(state, data, nblocks); +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha512_blocks_x86, + sha512_blocks_avx2); + else + static_call_update(sha512_blocks_x86, + 
sha512_blocks_avx); + } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { + static_call_update(sha512_blocks_x86, sha512_blocks_ssse3); + } +} -- cgit v1.2.3 From 6486f2b0368dca5b7ba003d3904bfc64ff2439d5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:19 -0700 Subject: lib/crypto: x86/sha512: Remove unnecessary checks for nblocks==0 Since sha512_blocks() is called only with nblocks >= 1, remove unnecessary checks for nblocks == 0 from the x86 SHA-512 assembly code. Link: https://lore.kernel.org/r/20250630160320.2888-16-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/sha512-avx-asm.S | 6 +----- lib/crypto/x86/sha512-avx2-asm.S | 5 +---- lib/crypto/x86/sha512-ssse3-asm.S | 6 +----- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/lib/crypto/x86/sha512-avx-asm.S b/lib/crypto/x86/sha512-avx-asm.S index 0b5f69179d62..7732aa8fd850 100644 --- a/lib/crypto/x86/sha512-avx-asm.S +++ b/lib/crypto/x86/sha512-avx-asm.S @@ -272,13 +272,10 @@ frame_size = frame_WK + WK_SIZE # stored in "data". # The size of the message pointed to by "data" must be an integer multiple # of SHA512 message blocks. -# "nblocks" is the message length in SHA512 blocks +# "nblocks" is the message length in SHA512 blocks. Must be >= 1. ######################################################################## SYM_FUNC_START(sha512_transform_avx) - test msglen, msglen - je .Lnowork - # Save GPRs push %rbx push %r12 @@ -362,7 +359,6 @@ SYM_FUNC_START(sha512_transform_avx) pop %r12 pop %rbx -.Lnowork: RET SYM_FUNC_END(sha512_transform_avx) diff --git a/lib/crypto/x86/sha512-avx2-asm.S b/lib/crypto/x86/sha512-avx2-asm.S index 2309c01e316b..22bdbfd899d0 100644 --- a/lib/crypto/x86/sha512-avx2-asm.S +++ b/lib/crypto/x86/sha512-avx2-asm.S @@ -564,7 +564,7 @@ frame_size = frame_CTX + CTX_SIZE # stored in "data". # The size of the message pointed to by "data" must be an integer multiple # of SHA512 message blocks. -# "nblocks" is the message length in SHA512 blocks +# "nblocks" is the message length in SHA512 blocks. Must be >= 1. ######################################################################## SYM_FUNC_START(sha512_transform_rorx) @@ -582,7 +582,6 @@ SYM_FUNC_START(sha512_transform_rorx) and $~(0x20 - 1), %rsp shl $7, NUM_BLKS # convert to bytes - jz .Ldone_hash add INP, NUM_BLKS # pointer to end of data mov NUM_BLKS, frame_INPEND(%rsp) @@ -668,8 +667,6 @@ SYM_FUNC_START(sha512_transform_rorx) cmp frame_INPEND(%rsp), INP jne .Lloop0 -.Ldone_hash: - # Restore Stack Pointer mov %rbp, %rsp pop %rbp diff --git a/lib/crypto/x86/sha512-ssse3-asm.S b/lib/crypto/x86/sha512-ssse3-asm.S index 12e78142f2e3..4cae7445b2a8 100644 --- a/lib/crypto/x86/sha512-ssse3-asm.S +++ b/lib/crypto/x86/sha512-ssse3-asm.S @@ -271,13 +271,10 @@ frame_size = frame_WK + WK_SIZE # stored in "data". # The size of the message pointed to by "data" must be an integer multiple # of SHA512 message blocks. -# "nblocks" is the message length in SHA512 blocks +# "nblocks" is the message length in SHA512 blocks. Must be >= 1. 
######################################################################## SYM_FUNC_START(sha512_transform_ssse3) - test msglen, msglen - je .Lnowork - # Save GPRs push %rbx push %r12 @@ -361,7 +358,6 @@ SYM_FUNC_START(sha512_transform_ssse3) pop %r12 pop %rbx -.Lnowork: RET SYM_FUNC_END(sha512_transform_ssse3) -- cgit v1.2.3 From 9b5c0d82b26d10733f67e10ea1889fc24aa6840a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:03:20 -0700 Subject: crypto: sha512 - Remove sha512_base.h sha512_base.h is no longer used, so remove it. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160320.2888-17-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha512_base.h | 117 ------------------------------------------- 1 file changed, 117 deletions(-) delete mode 100644 include/crypto/sha512_base.h diff --git a/include/crypto/sha512_base.h b/include/crypto/sha512_base.h deleted file mode 100644 index d1361b3eb70b..000000000000 --- a/include/crypto/sha512_base.h +++ /dev/null @@ -1,117 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sha512_base.h - core logic for SHA-512 implementations - * - * Copyright (C) 2015 Linaro Ltd - */ - -#ifndef _CRYPTO_SHA512_BASE_H -#define _CRYPTO_SHA512_BASE_H - -#include -#include -#include -#include -#include -#include -#include - -typedef void (sha512_block_fn)(struct sha512_state *sst, u8 const *src, - int blocks); - -static inline int sha384_base_init(struct shash_desc *desc) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA384_H0; - sctx->state[1] = SHA384_H1; - sctx->state[2] = SHA384_H2; - sctx->state[3] = SHA384_H3; - sctx->state[4] = SHA384_H4; - sctx->state[5] = SHA384_H5; - sctx->state[6] = SHA384_H6; - sctx->state[7] = SHA384_H7; - sctx->count[0] = sctx->count[1] = 0; - - return 0; -} - -static inline int sha512_base_init(struct shash_desc *desc) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA512_H0; - sctx->state[1] = SHA512_H1; - sctx->state[2] = SHA512_H2; - sctx->state[3] = SHA512_H3; - sctx->state[4] = SHA512_H4; - sctx->state[5] = SHA512_H5; - sctx->state[6] = SHA512_H6; - sctx->state[7] = SHA512_H7; - sctx->count[0] = sctx->count[1] = 0; - - return 0; -} - -static inline int sha512_base_do_update_blocks(struct shash_desc *desc, - const u8 *data, - unsigned int len, - sha512_block_fn *block_fn) -{ - unsigned int remain = len - round_down(len, SHA512_BLOCK_SIZE); - struct sha512_state *sctx = shash_desc_ctx(desc); - - len -= remain; - sctx->count[0] += len; - if (sctx->count[0] < len) - sctx->count[1]++; - block_fn(sctx, data, len / SHA512_BLOCK_SIZE); - return remain; -} - -static inline int sha512_base_do_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, - sha512_block_fn *block_fn) -{ - unsigned int bit_offset = SHA512_BLOCK_SIZE / 8 - 2; - struct sha512_state *sctx = shash_desc_ctx(desc); - union { - __be64 b64[SHA512_BLOCK_SIZE / 4]; - u8 u8[SHA512_BLOCK_SIZE * 2]; - } block = {}; - - if (len >= SHA512_BLOCK_SIZE) { - int remain; - - remain = sha512_base_do_update_blocks(desc, src, len, block_fn); - src += len - remain; - len = remain; - } - - if (len >= bit_offset * 8) - bit_offset += SHA512_BLOCK_SIZE / 8; - memcpy(&block, src, len); - block.u8[len] = 0x80; - sctx->count[0] += len; - block.b64[bit_offset] = cpu_to_be64(sctx->count[1] << 3 | - sctx->count[0] >> 61); - block.b64[bit_offset + 1] = cpu_to_be64(sctx->count[0] << 3); - block_fn(sctx, block.u8, (bit_offset + 2) * 8 / SHA512_BLOCK_SIZE); - 
memzero_explicit(&block, sizeof(block)); - - return 0; -} - -static inline int sha512_base_finish(struct shash_desc *desc, u8 *out) -{ - unsigned int digest_size = crypto_shash_digestsize(desc->tfm); - struct sha512_state *sctx = shash_desc_ctx(desc); - __be64 *digest = (__be64 *)out; - int i; - - for (i = 0; digest_size > 0; i++, digest_size -= sizeof(__be64)) - put_unaligned_be64(sctx->state[i], digest++); - return 0; -} - -#endif /* _CRYPTO_SHA512_BASE_H */ -- cgit v1.2.3 From 4a32e5dc1dcfa8f1ed6ca33328e71dcb6196c09e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 19 Jun 2025 12:19:00 -0700 Subject: lib/crypto: arm: Move arch/arm/lib/crypto/ into lib/crypto/ Move the contents of arch/arm/lib/crypto/ into lib/crypto/arm/. The new code organization makes a lot more sense for how this code actually works and is developed. In particular, it makes it possible to build each algorithm as a single module, with better inlining and dead code elimination. For a more detailed explanation, see the patchset which did this for the CRC library code: https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/. Also see the patchset which did this for SHA-512: https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/ This is just a preparatory commit, which does the move to get the files into their new location but keeps them building the same way as before. Later commits will make the actual improvements to the way the arch-optimized code is integrated for each algorithm. Add a gitignore entry for the removed directory arch/arm/lib/crypto/ so that people don't accidentally commit leftover generated files. Acked-by: Ard Biesheuvel Reviewed-by: Martin K. Petersen Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/r/20250619191908.134235-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm/lib/.gitignore | 4 + arch/arm/lib/Makefile | 2 - arch/arm/lib/crypto/.gitignore | 3 - arch/arm/lib/crypto/Kconfig | 31 - arch/arm/lib/crypto/Makefile | 32 - arch/arm/lib/crypto/blake2s-core.S | 306 -------- arch/arm/lib/crypto/blake2s-glue.c | 7 - arch/arm/lib/crypto/chacha-glue.c | 138 ---- arch/arm/lib/crypto/chacha-neon-core.S | 643 ---------------- arch/arm/lib/crypto/chacha-scalar-core.S | 444 ----------- arch/arm/lib/crypto/poly1305-armv4.pl | 1236 ------------------------------ arch/arm/lib/crypto/poly1305-glue.c | 80 -- arch/arm/lib/crypto/sha256-armv4.pl | 724 ----------------- arch/arm/lib/crypto/sha256-ce.S | 123 --- arch/arm/lib/crypto/sha256.c | 64 -- lib/crypto/Kconfig | 2 +- lib/crypto/Makefile | 2 + lib/crypto/arm/.gitignore | 2 + lib/crypto/arm/Kconfig | 31 + lib/crypto/arm/Makefile | 32 + lib/crypto/arm/blake2s-core.S | 306 ++++++++ lib/crypto/arm/blake2s-glue.c | 7 + lib/crypto/arm/chacha-glue.c | 138 ++++ lib/crypto/arm/chacha-neon-core.S | 643 ++++++++++++++++ lib/crypto/arm/chacha-scalar-core.S | 444 +++++++++++ lib/crypto/arm/poly1305-armv4.pl | 1236 ++++++++++++++++++++++++++++++ lib/crypto/arm/poly1305-glue.c | 80 ++ lib/crypto/arm/sha256-armv4.pl | 724 +++++++++++++++++ lib/crypto/arm/sha256-ce.S | 123 +++ lib/crypto/arm/sha256.c | 64 ++ 30 files changed, 3837 insertions(+), 3834 deletions(-) create mode 100644 arch/arm/lib/.gitignore delete mode 100644 arch/arm/lib/crypto/.gitignore delete mode 100644 arch/arm/lib/crypto/Kconfig delete mode 100644 arch/arm/lib/crypto/Makefile delete mode 100644 arch/arm/lib/crypto/blake2s-core.S delete mode 100644 arch/arm/lib/crypto/blake2s-glue.c delete mode 100644 arch/arm/lib/crypto/chacha-glue.c delete 
mode 100644 arch/arm/lib/crypto/chacha-neon-core.S delete mode 100644 arch/arm/lib/crypto/chacha-scalar-core.S delete mode 100644 arch/arm/lib/crypto/poly1305-armv4.pl delete mode 100644 arch/arm/lib/crypto/poly1305-glue.c delete mode 100644 arch/arm/lib/crypto/sha256-armv4.pl delete mode 100644 arch/arm/lib/crypto/sha256-ce.S delete mode 100644 arch/arm/lib/crypto/sha256.c create mode 100644 lib/crypto/arm/Kconfig create mode 100644 lib/crypto/arm/Makefile create mode 100644 lib/crypto/arm/blake2s-core.S create mode 100644 lib/crypto/arm/blake2s-glue.c create mode 100644 lib/crypto/arm/chacha-glue.c create mode 100644 lib/crypto/arm/chacha-neon-core.S create mode 100644 lib/crypto/arm/chacha-scalar-core.S create mode 100644 lib/crypto/arm/poly1305-armv4.pl create mode 100644 lib/crypto/arm/poly1305-glue.c create mode 100644 lib/crypto/arm/sha256-armv4.pl create mode 100644 lib/crypto/arm/sha256-ce.S create mode 100644 lib/crypto/arm/sha256.c diff --git a/arch/arm/lib/.gitignore b/arch/arm/lib/.gitignore new file mode 100644 index 000000000000..647d7a922e68 --- /dev/null +++ b/arch/arm/lib/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +# This now-removed directory used to contain generated files. +/crypto/ diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 91ea0e29107a..d05dd672bcd9 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -5,8 +5,6 @@ # Copyright (C) 1995-2000 Russell King # -obj-y += crypto/ - lib-y := changebit.o csumipv6.o csumpartial.o \ csumpartialcopy.o csumpartialcopyuser.o clearbit.o \ delay.o delay-loop.o findbit.o memchr.o memcpy.o \ diff --git a/arch/arm/lib/crypto/.gitignore b/arch/arm/lib/crypto/.gitignore deleted file mode 100644 index 12d74d8b03d0..000000000000 --- a/arch/arm/lib/crypto/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -poly1305-core.S -sha256-core.S diff --git a/arch/arm/lib/crypto/Kconfig b/arch/arm/lib/crypto/Kconfig deleted file mode 100644 index d1ad664f0c67..000000000000 --- a/arch/arm/lib/crypto/Kconfig +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_BLAKE2S_ARM - bool "Hash functions: BLAKE2s" - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: arm - - This is faster than the generic implementations of BLAKE2s and - BLAKE2b, but slower than the NEON implementation of BLAKE2b. - There is no NEON implementation of BLAKE2s, since NEON doesn't - really help with it. 
- -config CRYPTO_CHACHA20_NEON - tristate - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_ARM - tristate - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - -config CRYPTO_SHA256_ARM - tristate - depends on !CPU_V7M - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD diff --git a/arch/arm/lib/crypto/Makefile b/arch/arm/lib/crypto/Makefile deleted file mode 100644 index 431f77c3ff6f..000000000000 --- a/arch/arm/lib/crypto/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o -libblake2s-arm-y := blake2s-core.o blake2s-glue.o - -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o -chacha-neon-y := chacha-scalar-core.o chacha-glue.o -chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o - -obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o -poly1305-arm-y := poly1305-core.o poly1305-glue.o - -obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o -sha256-arm-y := sha256.o sha256-core.o -sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o - -quiet_cmd_perl = PERL $@ - cmd_perl = $(PERL) $(<) > $(@) - -$(obj)/%-core.S: $(src)/%-armv4.pl - $(call cmd,perl) - -clean-files += poly1305-core.S sha256-core.S - -aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 - -# massage the perlasm code a bit so we only get the NEON routine if we need it -poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 -poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 -AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) - -AFLAGS_sha256-core.o += $(aflags-thumb2-y) diff --git a/arch/arm/lib/crypto/blake2s-core.S b/arch/arm/lib/crypto/blake2s-core.S deleted file mode 100644 index df40e46601f1..000000000000 --- a/arch/arm/lib/crypto/blake2s-core.S +++ /dev/null @@ -1,306 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * BLAKE2s digest algorithm, ARM scalar implementation - * - * Copyright 2020 Google LLC - * - * Author: Eric Biggers - */ - -#include -#include - - // Registers used to hold message words temporarily. There aren't - // enough ARM registers to hold the whole message block, so we have to - // load the words on-demand. - M_0 .req r12 - M_1 .req r14 - -// The BLAKE2s initialization vector -.Lblake2s_IV: - .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A - .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 - -.macro __ldrd a, b, src, offset -#if __LINUX_ARM_ARCH__ >= 6 - ldrd \a, \b, [\src, #\offset] -#else - ldr \a, [\src, #\offset] - ldr \b, [\src, #\offset + 4] -#endif -.endm - -.macro __strd a, b, dst, offset -#if __LINUX_ARM_ARCH__ >= 6 - strd \a, \b, [\dst, #\offset] -#else - str \a, [\dst, #\offset] - str \b, [\dst, #\offset + 4] -#endif -.endm - -.macro _le32_bswap a, tmp -#ifdef __ARMEB__ - rev_l \a, \tmp -#endif -.endm - -.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp - _le32_bswap \a, \tmp - _le32_bswap \b, \tmp - _le32_bswap \c, \tmp - _le32_bswap \d, \tmp - _le32_bswap \e, \tmp - _le32_bswap \f, \tmp - _le32_bswap \g, \tmp - _le32_bswap \h, \tmp -.endm - -// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals. -// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two -// columns/diagonals. s0-s1 are the word offsets to the message words the first -// column/diagonal needs, and likewise s2-s3 for the second column/diagonal. 
-// M_0 and M_1 are free to use, and the message block can be found at sp + 32. -// -// Note that to save instructions, the rotations don't happen when the -// pseudocode says they should, but rather they are delayed until the values are -// used. See the comment above _blake2s_round(). -.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3 - - ldr M_0, [sp, #32 + 4 * \s0] - ldr M_1, [sp, #32 + 4 * \s2] - - // a += b + m[blake2s_sigma[r][2*i + 0]]; - add \a0, \a0, \b0, ror #brot - add \a1, \a1, \b1, ror #brot - add \a0, \a0, M_0 - add \a1, \a1, M_1 - - // d = ror32(d ^ a, 16); - eor \d0, \a0, \d0, ror #drot - eor \d1, \a1, \d1, ror #drot - - // c += d; - add \c0, \c0, \d0, ror #16 - add \c1, \c1, \d1, ror #16 - - // b = ror32(b ^ c, 12); - eor \b0, \c0, \b0, ror #brot - eor \b1, \c1, \b1, ror #brot - - ldr M_0, [sp, #32 + 4 * \s1] - ldr M_1, [sp, #32 + 4 * \s3] - - // a += b + m[blake2s_sigma[r][2*i + 1]]; - add \a0, \a0, \b0, ror #12 - add \a1, \a1, \b1, ror #12 - add \a0, \a0, M_0 - add \a1, \a1, M_1 - - // d = ror32(d ^ a, 8); - eor \d0, \a0, \d0, ror#16 - eor \d1, \a1, \d1, ror#16 - - // c += d; - add \c0, \c0, \d0, ror#8 - add \c1, \c1, \d1, ror#8 - - // b = ror32(b ^ c, 7); - eor \b0, \c0, \b0, ror#12 - eor \b1, \c1, \b1, ror#12 -.endm - -// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9] -// are in r0..r9. The stack pointer points to 8 bytes of scratch space for -// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and -// r14 are free to use. The macro arguments s0-s15 give the order in which the -// message words are used in this round. -// -// All rotates are performed using the implicit rotate operand accepted by the -// 'add' and 'eor' instructions. This is faster than using explicit rotate -// instructions. To make this work, we allow the values in the second and last -// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the -// wrong rotation amount. The rotation amount is then fixed up just in time -// when the values are used. 'brot' is the number of bits the values in row 'b' -// need to be rotated right to arrive at the correct values, and 'drot' -// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such -// that they end up as (7, 8) after every round. -.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \ - s8, s9, s10, s11, s12, s13, s14, s15 - - // Mix first two columns: - // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]). - __ldrd r10, r11, sp, 16 // load v[12] and v[13] - _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \ - \s0, \s1, \s2, \s3 - __strd r8, r9, sp, 0 - __strd r10, r11, sp, 16 - - // Mix second two columns: - // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]). - __ldrd r8, r9, sp, 8 // load v[10] and v[11] - __ldrd r10, r11, sp, 24 // load v[14] and v[15] - _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \ - \s4, \s5, \s6, \s7 - str r10, [sp, #24] // store v[14] - // v[10], v[11], and v[15] are used below, so no need to store them yet. - - .set brot, 7 - .set drot, 8 - - // Mix first two diagonals: - // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]). - ldr r10, [sp, #16] // load v[12] - _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \ - \s8, \s9, \s10, \s11 - __strd r8, r9, sp, 8 - str r11, [sp, #28] - str r10, [sp, #16] - - // Mix second two diagonals: - // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]). 
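For readability, this is the plain BLAKE2s quarterround (the G function of RFC 7693) that the macro above computes, written in C with the nominal rotations 16/12/8/7 applied immediately instead of being deferred; a reference sketch only, using the kernel's ror32() helper:

static inline void blake2s_g(u32 *a, u32 *b, u32 *c, u32 *d, u32 m0, u32 m1)
{
	*a += *b + m0;
	*d = ror32(*d ^ *a, 16);
	*c += *d;
	*b = ror32(*b ^ *c, 12);
	*a += *b + m1;
	*d = ror32(*d ^ *a, 8);
	*c += *d;
	*b = ror32(*b ^ *c, 7);
}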
- __ldrd r8, r9, sp, 0 // load v[8] and v[9] - __ldrd r10, r11, sp, 20 // load v[13] and v[14] - _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \ - \s12, \s13, \s14, \s15 - __strd r10, r11, sp, 20 -.endm - -// -// void blake2s_compress(struct blake2s_state *state, -// const u8 *block, size_t nblocks, u32 inc); -// -// Only the first three fields of struct blake2s_state are used: -// u32 h[8]; (inout) -// u32 t[2]; (inout) -// u32 f[2]; (in) -// - .align 5 -ENTRY(blake2s_compress) - push {r0-r2,r4-r11,lr} // keep this an even number - -.Lnext_block: - // r0 is 'state' - // r1 is 'block' - // r3 is 'inc' - - // Load and increment the counter t[0..1]. - __ldrd r10, r11, r0, 32 - adds r10, r10, r3 - adc r11, r11, #0 - __strd r10, r11, r0, 32 - - // _blake2s_round is very short on registers, so copy the message block - // to the stack to save a register during the rounds. This also has the - // advantage that misalignment only needs to be dealt with in one place. - sub sp, sp, #64 - mov r12, sp - tst r1, #3 - bne .Lcopy_block_misaligned - ldmia r1!, {r2-r9} - _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 - stmia r12!, {r2-r9} - ldmia r1!, {r2-r9} - _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 - stmia r12, {r2-r9} -.Lcopy_block_done: - str r1, [sp, #68] // Update message pointer - - // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space - // for spilling v[8..9]. Leave v[8..9] in r8-r9. - mov r14, r0 // r14 = state - adr r12, .Lblake2s_IV - ldmia r12!, {r8-r9} // load IV[0..1] - __ldrd r0, r1, r14, 40 // load f[0..1] - ldm r12, {r2-r7} // load IV[3..7] - eor r4, r4, r10 // v[12] = IV[4] ^ t[0] - eor r5, r5, r11 // v[13] = IV[5] ^ t[1] - eor r6, r6, r0 // v[14] = IV[6] ^ f[0] - eor r7, r7, r1 // v[15] = IV[7] ^ f[1] - push {r2-r7} // push v[9..15] - sub sp, sp, #8 // leave space for v[8..9] - - // Load h[0..7] == v[0..7]. - ldm r14, {r0-r7} - - // Execute the rounds. Each round is provided the order in which it - // needs to use the message words. - .set brot, 0 - .set drot, 0 - _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 - _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 - _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 - _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 - _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 - _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 - _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 - _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 - _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 - - // Fold the final state matrix into the hash chaining value: - // - // for (i = 0; i < 8; i++) - // h[i] ^= v[i] ^ v[i + 8]; - // - ldr r14, [sp, #96] // r14 = &h[0] - add sp, sp, #8 // v[8..9] are already loaded. 
- pop {r10-r11} // load v[10..11] - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - eor r3, r3, r11 - ldm r14, {r8-r11} // load h[0..3] - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - eor r3, r3, r11 - stmia r14!, {r0-r3} // store new h[0..3] - ldm r14, {r0-r3} // load old h[4..7] - pop {r8-r11} // load v[12..15] - eor r0, r0, r4, ror #brot - eor r1, r1, r5, ror #brot - eor r2, r2, r6, ror #brot - eor r3, r3, r7, ror #brot - eor r0, r0, r8, ror #drot - eor r1, r1, r9, ror #drot - eor r2, r2, r10, ror #drot - eor r3, r3, r11, ror #drot - add sp, sp, #64 // skip copy of message block - stm r14, {r0-r3} // store new h[4..7] - - // Advance to the next block, if there is one. Note that if there are - // multiple blocks, then 'inc' (the counter increment amount) must be - // 64. So we can simply set it to 64 without re-loading it. - ldm sp, {r0, r1, r2} // load (state, block, nblocks) - mov r3, #64 // set 'inc' - subs r2, r2, #1 // nblocks-- - str r2, [sp, #8] - bne .Lnext_block // nblocks != 0? - - pop {r0-r2,r4-r11,pc} - - // The next message block (pointed to by r1) isn't 4-byte aligned, so it - // can't be loaded using ldmia. Copy it to the stack buffer (pointed to - // by r12) using an alternative method. r2-r9 are free to use. -.Lcopy_block_misaligned: - mov r2, #64 -1: -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - ldr r3, [r1], #4 - _le32_bswap r3, r4 -#else - ldrb r3, [r1, #0] - ldrb r4, [r1, #1] - ldrb r5, [r1, #2] - ldrb r6, [r1, #3] - add r1, r1, #4 - orr r3, r3, r4, lsl #8 - orr r3, r3, r5, lsl #16 - orr r3, r3, r6, lsl #24 -#endif - subs r2, r2, #4 - str r3, [r12], #4 - bne 1b - b .Lcopy_block_done -ENDPROC(blake2s_compress) diff --git a/arch/arm/lib/crypto/blake2s-glue.c b/arch/arm/lib/crypto/blake2s-glue.c deleted file mode 100644 index 0238a70d9581..000000000000 --- a/arch/arm/lib/crypto/blake2s-glue.c +++ /dev/null @@ -1,7 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include - -/* defined in blake2s-core.S */ -EXPORT_SYMBOL(blake2s_compress); diff --git a/arch/arm/lib/crypto/chacha-glue.c b/arch/arm/lib/crypto/chacha-glue.c deleted file mode 100644 index 88ec96415283..000000000000 --- a/arch/arm/lib/crypto/chacha-glue.c +++ /dev/null @@ -1,138 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ChaCha and HChaCha functions (ARM optimized) - * - * Copyright (C) 2016-2019 Linaro, Ltd. 
- * Copyright (C) 2015 Martin Willi - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, - u8 *dst, const u8 *src, int nrounds); -asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, - u8 *dst, const u8 *src, - int nrounds, unsigned int nbytes); -asmlinkage void hchacha_block_arm(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); -asmlinkage void hchacha_block_neon(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); - -asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, - const struct chacha_state *state, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon); - -static inline bool neon_usable(void) -{ - return static_branch_likely(&use_neon) && crypto_simd_usable(); -} - -static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - u8 buf[CHACHA_BLOCK_SIZE]; - - while (bytes > CHACHA_BLOCK_SIZE) { - unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U); - - chacha_4block_xor_neon(state, dst, src, nrounds, l); - bytes -= l; - src += l; - dst += l; - state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); - } - if (bytes) { - const u8 *s = src; - u8 *d = dst; - - if (bytes != CHACHA_BLOCK_SIZE) - s = d = memcpy(buf, src, bytes); - chacha_block_xor_neon(state, d, s, nrounds); - if (d != dst) - memcpy(dst, buf, bytes); - state->x[12]++; - } -} - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { - hchacha_block_arm(state, out, nrounds); - } else { - kernel_neon_begin(); - hchacha_block_neon(state, out, nrounds); - kernel_neon_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() || - bytes <= CHACHA_BLOCK_SIZE) { - chacha_doarm(dst, src, bytes, state, nrounds); - state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE); - return; - } - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_neon_begin(); - chacha_doneon(state, dst, src, todo, nrounds); - kernel_neon_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -bool chacha_is_arch_optimized(void) -{ - /* We always can use at least the ARM scalar implementation. */ - return true; -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -static int __init chacha_arm_mod_init(void) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { - switch (read_cpuid_part()) { - case ARM_CPU_PART_CORTEX_A7: - case ARM_CPU_PART_CORTEX_A5: - /* - * The Cortex-A7 and Cortex-A5 do not perform well with - * the NEON implementation but do incredibly with the - * scalar one and use less power. 
- */ - break; - default: - static_branch_enable(&use_neon); - } - } - return 0; -} -subsys_initcall(chacha_arm_mod_init); - -static void __exit chacha_arm_mod_exit(void) -{ -} -module_exit(chacha_arm_mod_exit); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm/lib/crypto/chacha-neon-core.S b/arch/arm/lib/crypto/chacha-neon-core.S deleted file mode 100644 index ddd62b6294a5..000000000000 --- a/arch/arm/lib/crypto/chacha-neon-core.S +++ /dev/null @@ -1,643 +0,0 @@ -/* - * ChaCha/HChaCha NEON helper functions - * - * Copyright (C) 2016 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - - /* - * NEON doesn't have a rotate instruction. The alternatives are, more or less: - * - * (a) vshl.u32 + vsri.u32 (needs temporary register) - * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register) - * (c) vrev32.16 (16-bit rotations only) - * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only, - * needs index vector) - * - * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations, - * the only choices are (a) and (b). We use (a) since it takes two-thirds the - * cycles of (b) on both Cortex-A7 and Cortex-A53. - * - * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest - * and doesn't need a temporary register. - * - * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence - * is twice as fast as (a), even when doing (a) on multiple registers - * simultaneously to eliminate the stall between vshl and vsri. Also, it - * parallelizes better when temporary registers are scarce. - * - * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as - * (a), so the need to load the rotation table actually makes the vtbl method - * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it - * seems to be a good compromise to get a more significant speed boost on some - * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7. - */ - -#include -#include - - .text - .fpu neon - .align 5 - -/* - * chacha_permute - permute one block - * - * Permute one 64-byte block where the state matrix is stored in the four NEON - * registers q0-q3. It performs matrix operations on four words in parallel, - * but requires shuffling to rearrange the words after each round. - * - * The round count is given in r3. 
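For reference, the double round that chacha_permute iterates (a column round followed by a diagonal round, rotations 16/12/8/7 per RFC 7539) looks like this in plain C; an illustrative sketch using the kernel's rol32() helper, not kernel code:

#define CHACHA_QR(a, b, c, d) do {		\
	a += b; d = rol32(d ^ a, 16);		\
	c += d; b = rol32(b ^ c, 12);		\
	a += b; d = rol32(d ^ a, 8);		\
	c += d; b = rol32(b ^ c, 7);		\
} while (0)

static void chacha_doubleround(u32 x[16])
{
	/* column round */
	CHACHA_QR(x[0], x[4], x[8],  x[12]);
	CHACHA_QR(x[1], x[5], x[9],  x[13]);
	CHACHA_QR(x[2], x[6], x[10], x[14]);
	CHACHA_QR(x[3], x[7], x[11], x[15]);
	/* diagonal round */
	CHACHA_QR(x[0], x[5], x[10], x[15]);
	CHACHA_QR(x[1], x[6], x[11], x[12]);
	CHACHA_QR(x[2], x[7], x[8],  x[13]);
	CHACHA_QR(x[3], x[4], x[9],  x[14]);
}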
- * - * Clobbers: r3, ip, q4-q5 - */ -chacha_permute: - - adr ip, .Lrol8_table - vld1.8 {d10}, [ip, :64] - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vrev32.16 q3, q3 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #12 - vsri.u32 q1, q4, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vtbl.8 d6, {d6}, d10 - vtbl.8 d7, {d7}, d10 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #7 - vsri.u32 q1, q4, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vext.8 q1, q1, q1, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vext.8 q2, q2, q2, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vext.8 q3, q3, q3, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vrev32.16 q3, q3 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #12 - vsri.u32 q1, q4, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vtbl.8 d6, {d6}, d10 - vtbl.8 d7, {d7}, d10 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #7 - vsri.u32 q1, q4, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vext.8 q1, q1, q1, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vext.8 q2, q2, q2, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vext.8 q3, q3, q3, #4 - - subs r3, r3, #2 - bne .Ldoubleround - - bx lr -ENDPROC(chacha_permute) - -ENTRY(chacha_block_xor_neon) - // r0: Input state matrix, s - // r1: 1 data block output, o - // r2: 1 data block input, i - // r3: nrounds - push {lr} - - // x0..3 = s0..3 - add ip, r0, #0x20 - vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [ip] - - vmov q8, q0 - vmov q9, q1 - vmov q10, q2 - vmov q11, q3 - - bl chacha_permute - - add ip, r2, #0x20 - vld1.8 {q4-q5}, [r2] - vld1.8 {q6-q7}, [ip] - - // o0 = i0 ^ (x0 + s0) - vadd.i32 q0, q0, q8 - veor q0, q0, q4 - - // o1 = i1 ^ (x1 + s1) - vadd.i32 q1, q1, q9 - veor q1, q1, q5 - - // o2 = i2 ^ (x2 + s2) - vadd.i32 q2, q2, q10 - veor q2, q2, q6 - - // o3 = i3 ^ (x3 + s3) - vadd.i32 q3, q3, q11 - veor q3, q3, q7 - - add ip, r1, #0x20 - vst1.8 {q0-q1}, [r1] - vst1.8 {q2-q3}, [ip] - - pop {pc} -ENDPROC(chacha_block_xor_neon) - -ENTRY(hchacha_block_neon) - // r0: Input state matrix, s - // r1: output (8 32-bit words) - // r2: nrounds - push {lr} - - vld1.32 {q0-q1}, [r0]! - vld1.32 {q2-q3}, [r0] - - mov r3, r2 - bl chacha_permute - - vst1.32 {q0}, [r1]! - vst1.32 {q3}, [r1] - - pop {pc} -ENDPROC(hchacha_block_neon) - - .align 4 -.Lctrinc: .word 0, 1, 2, 3 -.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 - - .align 5 -ENTRY(chacha_4block_xor_neon) - push {r4, lr} - mov r4, sp // preserve the stack pointer - sub ip, sp, #0x20 // allocate a 32 byte buffer - bic ip, ip, #0x1f // aligned to 32 bytes - mov sp, ip - - // r0: Input state matrix, s - // r1: 4 data blocks output, o - // r2: 4 data blocks input, i - // r3: nrounds - - // - // This function encrypts four consecutive ChaCha blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. The words are re-interleaved before the - // final addition of the original state and the XORing step. 
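The layout described above (each NEON register holds one state word across all four blocks) can be pictured in C; each inner loop over j below corresponds to a single vector instruction on one q register. A sketch with hypothetical names:

static void chacha_load_four_blocks(const struct chacha_state *state,
				    u32 x[16][4])
{
	int i, j;

	/* broadcast word i of the state into lane j of "register" x[i] */
	for (i = 0; i < 16; i++)
		for (j = 0; j < 4; j++)
			x[i][j] = state->x[i];		/* like vdup.32 */

	/* give each block its own counter value, 0..3 */
	for (j = 0; j < 4; j++)
		x[12][j] += j;
}

Each step of the rounds then operates on whole rows of x[][]; for example the "x0 += x4" of a column round becomes four lane-wise additions, i.e. one vadd.i32 on a q register, which is why no word shuffling is needed inside the rounds.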
- // - - // x0..15[0-3] = s0..15[0-3] - add ip, r0, #0x20 - vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [ip] - - adr lr, .Lctrinc - vdup.32 q15, d7[1] - vdup.32 q14, d7[0] - vld1.32 {q4}, [lr, :128] - vdup.32 q13, d6[1] - vdup.32 q12, d6[0] - vdup.32 q11, d5[1] - vdup.32 q10, d5[0] - vadd.u32 q12, q12, q4 // x12 += counter values 0-3 - vdup.32 q9, d4[1] - vdup.32 q8, d4[0] - vdup.32 q7, d3[1] - vdup.32 q6, d3[0] - vdup.32 q5, d2[1] - vdup.32 q4, d2[0] - vdup.32 q3, d1[1] - vdup.32 q2, d1[0] - vdup.32 q1, d0[1] - vdup.32 q0, d0[0] - - adr ip, .Lrol8_table - b 1f - -.Ldoubleround4: - vld1.32 {q8-q9}, [sp, :256] -1: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vadd.i32 q0, q0, q4 - vadd.i32 q1, q1, q5 - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - - veor q12, q12, q0 - veor q13, q13, q1 - veor q14, q14, q2 - veor q15, q15, q3 - - vrev32.16 q12, q12 - vrev32.16 q13, q13 - vrev32.16 q14, q14 - vrev32.16 q15, q15 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vadd.i32 q8, q8, q12 - vadd.i32 q9, q9, q13 - vadd.i32 q10, q10, q14 - vadd.i32 q11, q11, q15 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q4, q8 - veor q9, q5, q9 - vshl.u32 q4, q8, #12 - vshl.u32 q5, q9, #12 - vsri.u32 q4, q8, #20 - vsri.u32 q5, q9, #20 - - veor q8, q6, q10 - veor q9, q7, q11 - vshl.u32 q6, q8, #12 - vshl.u32 q7, q9, #12 - vsri.u32 q6, q8, #20 - vsri.u32 q7, q9, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vld1.8 {d16}, [ip, :64] - vadd.i32 q0, q0, q4 - vadd.i32 q1, q1, q5 - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - - veor q12, q12, q0 - veor q13, q13, q1 - veor q14, q14, q2 - veor q15, q15, q3 - - vtbl.8 d24, {d24}, d16 - vtbl.8 d25, {d25}, d16 - vtbl.8 d26, {d26}, d16 - vtbl.8 d27, {d27}, d16 - vtbl.8 d28, {d28}, d16 - vtbl.8 d29, {d29}, d16 - vtbl.8 d30, {d30}, d16 - vtbl.8 d31, {d31}, d16 - - vld1.32 {q8-q9}, [sp, :256] - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vadd.i32 q8, q8, q12 - vadd.i32 q9, q9, q13 - vadd.i32 q10, q10, q14 - vadd.i32 q11, q11, q15 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q4, q8 - veor q9, q5, q9 - vshl.u32 q4, q8, #7 - vshl.u32 q5, q9, #7 - vsri.u32 q4, q8, #25 - vsri.u32 q5, q9, #25 - - veor q8, q6, q10 - veor q9, q7, q11 - vshl.u32 q6, q8, #7 - vshl.u32 q7, q9, #7 - vsri.u32 q6, q8, #25 - vsri.u32 q7, q9, #25 - - vld1.32 {q8-q9}, [sp, :256] - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vadd.i32 q0, q0, q5 - vadd.i32 q1, q1, q6 - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q4 - - veor q15, q15, q0 - veor q12, q12, q1 - veor q13, q13, q2 - veor q14, q14, q3 - - vrev32.16 q15, q15 - vrev32.16 q12, q12 - vrev32.16 q13, q13 - vrev32.16 q14, q14 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vadd.i32 q10, q10, q15 - vadd.i32 q11, q11, q12 - vadd.i32 q8, q8, q13 - vadd.i32 q9, q9, q14 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q7, q8 - veor q9, q4, q9 - vshl.u32 
q7, q8, #12 - vshl.u32 q4, q9, #12 - vsri.u32 q7, q8, #20 - vsri.u32 q4, q9, #20 - - veor q8, q5, q10 - veor q9, q6, q11 - vshl.u32 q5, q8, #12 - vshl.u32 q6, q9, #12 - vsri.u32 q5, q8, #20 - vsri.u32 q6, q9, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vld1.8 {d16}, [ip, :64] - vadd.i32 q0, q0, q5 - vadd.i32 q1, q1, q6 - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q4 - - veor q15, q15, q0 - veor q12, q12, q1 - veor q13, q13, q2 - veor q14, q14, q3 - - vtbl.8 d30, {d30}, d16 - vtbl.8 d31, {d31}, d16 - vtbl.8 d24, {d24}, d16 - vtbl.8 d25, {d25}, d16 - vtbl.8 d26, {d26}, d16 - vtbl.8 d27, {d27}, d16 - vtbl.8 d28, {d28}, d16 - vtbl.8 d29, {d29}, d16 - - vld1.32 {q8-q9}, [sp, :256] - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vadd.i32 q10, q10, q15 - vadd.i32 q11, q11, q12 - vadd.i32 q8, q8, q13 - vadd.i32 q9, q9, q14 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q7, q8 - veor q9, q4, q9 - vshl.u32 q7, q8, #7 - vshl.u32 q4, q9, #7 - vsri.u32 q7, q8, #25 - vsri.u32 q4, q9, #25 - - veor q8, q5, q10 - veor q9, q6, q11 - vshl.u32 q5, q8, #7 - vshl.u32 q6, q9, #7 - vsri.u32 q5, q8, #25 - vsri.u32 q6, q9, #25 - - subs r3, r3, #2 - bne .Ldoubleround4 - - // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15. - // x8..9[0-3] are on the stack. - - // Re-interleave the words in the first two rows of each block (x0..7). - // Also add the counter values 0-3 to x12[0-3]. - vld1.32 {q8}, [lr, :128] // load counter values 0-3 - vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) - vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) - vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) - vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7) - vadd.u32 q12, q8 // x12 += counter values 0-3 - vswp d1, d4 - vswp d3, d6 - vld1.32 {q8-q9}, [r0]! // load s0..7 - vswp d9, d12 - vswp d11, d14 - - // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1) - // after XORing the first 32 bytes. - vswp q1, q4 - - // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7) - - // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block) - vadd.u32 q0, q0, q8 - vadd.u32 q2, q2, q8 - vadd.u32 q4, q4, q8 - vadd.u32 q3, q3, q8 - - // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block) - vadd.u32 q1, q1, q9 - vadd.u32 q6, q6, q9 - vadd.u32 q5, q5, q9 - vadd.u32 q7, q7, q9 - - // XOR first 32 bytes using keystream from first two rows of first block - vld1.8 {q8-q9}, [r2]! - veor q8, q8, q0 - veor q9, q9, q1 - vst1.8 {q8-q9}, [r1]! - - // Re-interleave the words in the last two rows of each block (x8..15). 
- vld1.32 {q8-q9}, [sp, :256] - mov sp, r4 // restore original stack pointer - ldr r4, [r4, #8] // load number of bytes - vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) - vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) - vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) - vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11) - vld1.32 {q0-q1}, [r0] // load s8..15 - vswp d25, d28 - vswp d27, d30 - vswp d17, d20 - vswp d19, d22 - - // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15) - - // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block) - vadd.u32 q8, q8, q0 - vadd.u32 q10, q10, q0 - vadd.u32 q9, q9, q0 - vadd.u32 q11, q11, q0 - - // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block) - vadd.u32 q12, q12, q1 - vadd.u32 q14, q14, q1 - vadd.u32 q13, q13, q1 - vadd.u32 q15, q15, q1 - - // XOR the rest of the data with the keystream - - vld1.8 {q0-q1}, [r2]! - subs r4, r4, #96 - veor q0, q0, q8 - veor q1, q1, q12 - ble .Lle96 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - subs r4, r4, #32 - veor q0, q0, q2 - veor q1, q1, q6 - ble .Lle128 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - subs r4, r4, #32 - veor q0, q0, q10 - veor q1, q1, q14 - ble .Lle160 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - subs r4, r4, #32 - veor q0, q0, q4 - veor q1, q1, q5 - ble .Lle192 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - subs r4, r4, #32 - veor q0, q0, q9 - veor q1, q1, q13 - ble .Lle224 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - subs r4, r4, #32 - veor q0, q0, q3 - veor q1, q1, q7 - blt .Llt256 -.Lout: - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2] - veor q0, q0, q11 - veor q1, q1, q15 - vst1.8 {q0-q1}, [r1] - - pop {r4, pc} - -.Lle192: - vmov q4, q9 - vmov q5, q13 - -.Lle160: - // nothing to do - -.Lfinalblock: - // Process the final block if processing less than 4 full blocks. - // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the - // previous 32 byte output block that still needs to be written at - // [r1] in q0-q1. - beq .Lfullblock - -.Lpartialblock: - adr lr, .Lpermute + 32 - add r2, r2, r4 - add lr, lr, r4 - add r4, r4, r1 - - vld1.8 {q2-q3}, [lr] - vld1.8 {q6-q7}, [r2] - - add r4, r4, #32 - - vtbl.8 d4, {q4-q5}, d4 - vtbl.8 d5, {q4-q5}, d5 - vtbl.8 d6, {q4-q5}, d6 - vtbl.8 d7, {q4-q5}, d7 - - veor q6, q6, q2 - veor q7, q7, q3 - - vst1.8 {q6-q7}, [r4] // overlapping stores - vst1.8 {q0-q1}, [r1] - pop {r4, pc} - -.Lfullblock: - vmov q11, q4 - vmov q15, q5 - b .Lout -.Lle96: - vmov q4, q2 - vmov q5, q6 - b .Lfinalblock -.Lle128: - vmov q4, q10 - vmov q5, q14 - b .Lfinalblock -.Lle224: - vmov q4, q3 - vmov q5, q7 - b .Lfinalblock -.Llt256: - vmov q4, q11 - vmov q5, q15 - b .Lpartialblock -ENDPROC(chacha_4block_xor_neon) - - .align L1_CACHE_SHIFT -.Lpermute: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 - .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 - .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f diff --git a/arch/arm/lib/crypto/chacha-scalar-core.S b/arch/arm/lib/crypto/chacha-scalar-core.S deleted file mode 100644 index 4951df05c158..000000000000 --- a/arch/arm/lib/crypto/chacha-scalar-core.S +++ /dev/null @@ -1,444 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2018 Google, Inc. 
- */ - -#include -#include - -/* - * Design notes: - * - * 16 registers would be needed to hold the state matrix, but only 14 are - * available because 'sp' and 'pc' cannot be used. So we spill the elements - * (x8, x9) to the stack and swap them out with (x10, x11). This adds one - * 'ldrd' and one 'strd' instruction per round. - * - * All rotates are performed using the implicit rotate operand accepted by the - * 'add' and 'eor' instructions. This is faster than using explicit rotate - * instructions. To make this work, we allow the values in the second and last - * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the - * wrong rotation amount. The rotation amount is then fixed up just in time - * when the values are used. 'brot' is the number of bits the values in row 'b' - * need to be rotated right to arrive at the correct values, and 'drot' - * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such - * that they end up as (25, 24) after every round. - */ - - // ChaCha state registers - X0 .req r0 - X1 .req r1 - X2 .req r2 - X3 .req r3 - X4 .req r4 - X5 .req r5 - X6 .req r6 - X7 .req r7 - X8_X10 .req r8 // shared by x8 and x10 - X9_X11 .req r9 // shared by x9 and x11 - X12 .req r10 - X13 .req r11 - X14 .req r12 - X15 .req r14 - -.macro _le32_bswap_4x a, b, c, d, tmp -#ifdef __ARMEB__ - rev_l \a, \tmp - rev_l \b, \tmp - rev_l \c, \tmp - rev_l \d, \tmp -#endif -.endm - -.macro __ldrd a, b, src, offset -#if __LINUX_ARM_ARCH__ >= 6 - ldrd \a, \b, [\src, #\offset] -#else - ldr \a, [\src, #\offset] - ldr \b, [\src, #\offset + 4] -#endif -.endm - -.macro __strd a, b, dst, offset -#if __LINUX_ARM_ARCH__ >= 6 - strd \a, \b, [\dst, #\offset] -#else - str \a, [\dst, #\offset] - str \b, [\dst, #\offset + 4] -#endif -.endm - -.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2 - - // a += b; d ^= a; d = rol(d, 16); - add \a1, \a1, \b1, ror #brot - add \a2, \a2, \b2, ror #brot - eor \d1, \a1, \d1, ror #drot - eor \d2, \a2, \d2, ror #drot - // drot == 32 - 16 == 16 - - // c += d; b ^= c; b = rol(b, 12); - add \c1, \c1, \d1, ror #16 - add \c2, \c2, \d2, ror #16 - eor \b1, \c1, \b1, ror #brot - eor \b2, \c2, \b2, ror #brot - // brot == 32 - 12 == 20 - - // a += b; d ^= a; d = rol(d, 8); - add \a1, \a1, \b1, ror #20 - add \a2, \a2, \b2, ror #20 - eor \d1, \a1, \d1, ror #16 - eor \d2, \a2, \d2, ror #16 - // drot == 32 - 8 == 24 - - // c += d; b ^= c; b = rol(b, 7); - add \c1, \c1, \d1, ror #24 - add \c2, \c2, \d2, ror #24 - eor \b1, \c1, \b1, ror #20 - eor \b2, \c2, \b2, ror #20 - // brot == 32 - 7 == 25 -.endm - -.macro _doubleround - - // column round - - // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13) - _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13 - - // save (x8, x9); restore (x10, x11) - __strd X8_X10, X9_X11, sp, 0 - __ldrd X8_X10, X9_X11, sp, 8 - - // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15) - _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15 - - .set brot, 25 - .set drot, 24 - - // diagonal round - - // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12) - _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12 - - // save (x10, x11); restore (x8, x9) - __strd X8_X10, X9_X11, sp, 8 - __ldrd X8_X10, X9_X11, sp, 0 - - // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14) - _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14 -.endm - -.macro _chacha_permute nrounds - .set brot, 0 - .set drot, 0 - .rept \nrounds / 2 - _doubleround - .endr -.endm - -.macro _chacha nrounds - -.Lnext_block\@: - // Stack: unused0-unused1 
x10-x11 x0-x15 OUT IN LEN - // Registers contain x0-x9,x12-x15. - - // Do the core ChaCha permutation to update x0-x15. - _chacha_permute \nrounds - - add sp, #8 - // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN - // Registers contain x0-x9,x12-x15. - // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. - - // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15). - push {X8_X10, X9_X11, X12, X13, X14, X15} - - // Load (OUT, IN, LEN). - ldr r14, [sp, #96] - ldr r12, [sp, #100] - ldr r11, [sp, #104] - - orr r10, r14, r12 - - // Use slow path if fewer than 64 bytes remain. - cmp r11, #64 - blt .Lxor_slowpath\@ - - // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on - // ARMv6+, since ldmia and stmia (used below) still require alignment. - tst r10, #3 - bne .Lxor_slowpath\@ - - // Fast path: XOR 64 bytes of aligned data. - - // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN - // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT. - // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. - - // x0-x3 - __ldrd r8, r9, sp, 32 - __ldrd r10, r11, sp, 40 - add X0, X0, r8 - add X1, X1, r9 - add X2, X2, r10 - add X3, X3, r11 - _le32_bswap_4x X0, X1, X2, X3, r8 - ldmia r12!, {r8-r11} - eor X0, X0, r8 - eor X1, X1, r9 - eor X2, X2, r10 - eor X3, X3, r11 - stmia r14!, {X0-X3} - - // x4-x7 - __ldrd r8, r9, sp, 48 - __ldrd r10, r11, sp, 56 - add X4, r8, X4, ror #brot - add X5, r9, X5, ror #brot - ldmia r12!, {X0-X3} - add X6, r10, X6, ror #brot - add X7, r11, X7, ror #brot - _le32_bswap_4x X4, X5, X6, X7, r8 - eor X4, X4, X0 - eor X5, X5, X1 - eor X6, X6, X2 - eor X7, X7, X3 - stmia r14!, {X4-X7} - - // x8-x15 - pop {r0-r7} // (x8-x9,x12-x15,x10-x11) - __ldrd r8, r9, sp, 32 - __ldrd r10, r11, sp, 40 - add r0, r0, r8 // x8 - add r1, r1, r9 // x9 - add r6, r6, r10 // x10 - add r7, r7, r11 // x11 - _le32_bswap_4x r0, r1, r6, r7, r8 - ldmia r12!, {r8-r11} - eor r0, r0, r8 // x8 - eor r1, r1, r9 // x9 - eor r6, r6, r10 // x10 - eor r7, r7, r11 // x11 - stmia r14!, {r0,r1,r6,r7} - ldmia r12!, {r0,r1,r6,r7} - __ldrd r8, r9, sp, 48 - __ldrd r10, r11, sp, 56 - add r2, r8, r2, ror #drot // x12 - add r3, r9, r3, ror #drot // x13 - add r4, r10, r4, ror #drot // x14 - add r5, r11, r5, ror #drot // x15 - _le32_bswap_4x r2, r3, r4, r5, r9 - ldr r9, [sp, #72] // load LEN - eor r2, r2, r0 // x12 - eor r3, r3, r1 // x13 - eor r4, r4, r6 // x14 - eor r5, r5, r7 // x15 - subs r9, #64 // decrement and check LEN - stmia r14!, {r2-r5} - - beq .Ldone\@ - -.Lprepare_for_next_block\@: - - // Stack: x0-x15 OUT IN LEN - - // Increment block counter (x12) - add r8, #1 - - // Store updated (OUT, IN, LEN) - str r14, [sp, #64] - str r12, [sp, #68] - str r9, [sp, #72] - - mov r14, sp - - // Store updated block counter (x12) - str r8, [sp, #48] - - sub sp, #16 - - // Reload state and do next block - ldmia r14!, {r0-r11} // load x0-x11 - __strd r10, r11, sp, 8 // store x10-x11 before state - ldmia r14, {r10-r12,r14} // load x12-x15 - b .Lnext_block\@ - -.Lxor_slowpath\@: - // Slow path: < 64 bytes remaining, or unaligned input or output buffer. - // We handle it by storing the 64 bytes of keystream to the stack, then - // XOR-ing the needed portion with the data. - - // Allocate keystream buffer - sub sp, #64 - mov r14, sp - - // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN - // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0. - // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. 
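The 'brot'/'drot' trick used throughout this file is easier to follow in scalar C: the stored word is left unrotated, and the pending rotation is applied for free by the rotate operand of add/eor whenever the true value is needed. A small illustrative sketch (not kernel code):

static u32 deferred_rotation_demo(u32 a, u32 b, u32 c)
{
	u32 brot = 0;		/* true b == ror32(stored b, brot) */

	a += ror32(b, brot);	/* add  A, A, B, ror #brot */

	/*
	 * b = rol32(b ^ c, 12): do not rotate yet, just note that the
	 * stored value now lags its true value by ror #(32 - 12) == 20.
	 */
	b = c ^ ror32(b, brot);	/* eor  B, C, B, ror #brot */
	brot = 20;

	a += ror32(b, brot);	/* a later use: add A, A, B, ror #20 */
	return a;
}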
- - // Save keystream for x0-x3 - __ldrd r8, r9, sp, 96 - __ldrd r10, r11, sp, 104 - add X0, X0, r8 - add X1, X1, r9 - add X2, X2, r10 - add X3, X3, r11 - _le32_bswap_4x X0, X1, X2, X3, r8 - stmia r14!, {X0-X3} - - // Save keystream for x4-x7 - __ldrd r8, r9, sp, 112 - __ldrd r10, r11, sp, 120 - add X4, r8, X4, ror #brot - add X5, r9, X5, ror #brot - add X6, r10, X6, ror #brot - add X7, r11, X7, ror #brot - _le32_bswap_4x X4, X5, X6, X7, r8 - add r8, sp, #64 - stmia r14!, {X4-X7} - - // Save keystream for x8-x15 - ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11) - __ldrd r8, r9, sp, 128 - __ldrd r10, r11, sp, 136 - add r0, r0, r8 // x8 - add r1, r1, r9 // x9 - add r6, r6, r10 // x10 - add r7, r7, r11 // x11 - _le32_bswap_4x r0, r1, r6, r7, r8 - stmia r14!, {r0,r1,r6,r7} - __ldrd r8, r9, sp, 144 - __ldrd r10, r11, sp, 152 - add r2, r8, r2, ror #drot // x12 - add r3, r9, r3, ror #drot // x13 - add r4, r10, r4, ror #drot // x14 - add r5, r11, r5, ror #drot // x15 - _le32_bswap_4x r2, r3, r4, r5, r9 - stmia r14, {r2-r5} - - // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN - // Registers: r8 is block counter, r12 is IN. - - ldr r9, [sp, #168] // LEN - ldr r14, [sp, #160] // OUT - cmp r9, #64 - mov r0, sp - movle r1, r9 - movgt r1, #64 - // r1 is number of bytes to XOR, in range [1, 64] - -.if __LINUX_ARM_ARCH__ < 6 - orr r2, r12, r14 - tst r2, #3 // IN or OUT misaligned? - bne .Lxor_next_byte\@ -.endif - - // XOR a word at a time -.rept 16 - subs r1, #4 - blt .Lxor_words_done\@ - ldr r2, [r12], #4 - ldr r3, [r0], #4 - eor r2, r2, r3 - str r2, [r14], #4 -.endr - b .Lxor_slowpath_done\@ -.Lxor_words_done\@: - ands r1, r1, #3 - beq .Lxor_slowpath_done\@ - - // XOR a byte at a time -.Lxor_next_byte\@: - ldrb r2, [r12], #1 - ldrb r3, [r0], #1 - eor r2, r2, r3 - strb r2, [r14], #1 - subs r1, #1 - bne .Lxor_next_byte\@ - -.Lxor_slowpath_done\@: - subs r9, #64 - add sp, #96 - bgt .Lprepare_for_next_block\@ - -.Ldone\@: -.endm // _chacha - -/* - * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, - * const struct chacha_state *state, int nrounds); - */ -ENTRY(chacha_doarm) - cmp r2, #0 // len == 0? - reteq lr - - ldr ip, [sp] - cmp ip, #12 - - push {r0-r2,r4-r11,lr} - - // Push state x0-x15 onto stack. - // Also store an extra copy of x10-x11 just before the state. - - add X12, r3, #48 - ldm X12, {X12,X13,X14,X15} - push {X12,X13,X14,X15} - sub sp, sp, #64 - - __ldrd X8_X10, X9_X11, r3, 40 - __strd X8_X10, X9_X11, sp, 8 - __strd X8_X10, X9_X11, sp, 56 - ldm r3, {X0-X9_X11} - __strd X0, X1, sp, 16 - __strd X2, X3, sp, 24 - __strd X4, X5, sp, 32 - __strd X6, X7, sp, 40 - __strd X8_X10, X9_X11, sp, 48 - - beq 1f - _chacha 20 - -0: add sp, #76 - pop {r4-r11, pc} - -1: _chacha 12 - b 0b -ENDPROC(chacha_doarm) - -/* - * void hchacha_block_arm(const struct chacha_state *state, - * u32 out[HCHACHA_OUT_WORDS], int nrounds); - */ -ENTRY(hchacha_block_arm) - push {r1,r4-r11,lr} - - cmp r2, #12 // ChaCha12 ? 
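In C terms, hchacha_block_arm runs the bare permutation (no feed-forward addition of the input state) and returns words 0-3 and 12-15 of the result. A sketch, where chacha_permute_words() is a hypothetical scalar helper standing in for the permutation above:

static void hchacha_block_ref(const struct chacha_state *state,
			      u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
	u32 x[16];

	memcpy(x, state->x, sizeof(x));
	chacha_permute_words(x, nrounds);		/* hypothetical helper */
	memcpy(&out[0], &x[0],  4 * sizeof(u32));	/* words 0..3   */
	memcpy(&out[4], &x[12], 4 * sizeof(u32));	/* words 12..15 */
}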
- - mov r14, r0 - ldmia r14!, {r0-r11} // load x0-x11 - push {r10-r11} // store x10-x11 to stack - ldm r14, {r10-r12,r14} // load x12-x15 - sub sp, #8 - - beq 1f - _chacha_permute 20 - - // Skip over (unused0-unused1, x10-x11) -0: add sp, #16 - - // Fix up rotations of x12-x15 - ror X12, X12, #drot - ror X13, X13, #drot - pop {r4} // load 'out' - ror X14, X14, #drot - ror X15, X15, #drot - - // Store (x0-x3,x12-x15) to 'out' - stm r4, {X0,X1,X2,X3,X12,X13,X14,X15} - - pop {r4-r11,pc} - -1: _chacha_permute 12 - b 0b -ENDPROC(hchacha_block_arm) diff --git a/arch/arm/lib/crypto/poly1305-armv4.pl b/arch/arm/lib/crypto/poly1305-armv4.pl deleted file mode 100644 index d57c6e2fc84a..000000000000 --- a/arch/arm/lib/crypto/poly1305-armv4.pl +++ /dev/null @@ -1,1236 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -# project. -# ==================================================================== -# -# IALU(*)/gcc-4.4 NEON -# -# ARM11xx(ARMv6) 7.78/+100% - -# Cortex-A5 6.35/+130% 3.00 -# Cortex-A8 6.25/+115% 2.36 -# Cortex-A9 5.10/+95% 2.55 -# Cortex-A15 3.85/+85% 1.25(**) -# Snapdragon S4 5.70/+100% 1.48(**) -# -# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; -# (**) these are trade-off results, they can be improved by ~8% but at -# the cost of 15/12% regression on Cortex-A5/A7, it's even possible -# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; - -$flavour = shift; -if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } -else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); - -$code.=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arm -# define poly1305_emit poly1305_emit_arch -.globl poly1305_blocks_neon -#endif - -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.text - -.globl poly1305_emit -.globl poly1305_blocks -.globl poly1305_init -.type poly1305_init,%function -.align 5 -poly1305_init: -.Lpoly1305_init: - stmdb sp!,{r4-r11} - - eor r3,r3,r3 - cmp $inp,#0 - str r3,[$ctx,#0] @ zero hash value - str r3,[$ctx,#4] - str r3,[$ctx,#8] - str r3,[$ctx,#12] - str r3,[$ctx,#16] - str r3,[$ctx,#36] @ clear is_base2_26 - add $ctx,$ctx,#20 - -#ifdef __thumb2__ - it eq -#endif - moveq r0,#0 - beq .Lno_key - -#if __ARM_MAX_ARCH__>=7 - mov r3,#-1 - str r3,[$ctx,#28] @ impossible key power value -# ifndef __KERNEL__ - adr r11,.Lpoly1305_init - ldr r12,.LOPENSSL_armcap -# endif -#endif - ldrb r4,[$inp,#0] - mov r10,#0x0fffffff - ldrb r5,[$inp,#1] - and r3,r10,#-4 @ 0x0ffffffc - ldrb r6,[$inp,#2] - ldrb r7,[$inp,#3] - orr r4,r4,r5,lsl#8 - ldrb r5,[$inp,#4] - orr r4,r4,r6,lsl#16 - ldrb r6,[$inp,#5] - orr r4,r4,r7,lsl#24 - ldrb r7,[$inp,#6] - and r4,r4,r10 - -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -# if !defined(_WIN32) - ldr r12,[r11,r12] @ OPENSSL_armcap_P -# endif -# if 
defined(__APPLE__) || defined(_WIN32) - ldr r12,[r12] -# endif -#endif - ldrb r8,[$inp,#7] - orr r5,r5,r6,lsl#8 - ldrb r6,[$inp,#8] - orr r5,r5,r7,lsl#16 - ldrb r7,[$inp,#9] - orr r5,r5,r8,lsl#24 - ldrb r8,[$inp,#10] - and r5,r5,r3 - -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - tst r12,#ARMV7_NEON @ check for NEON -# ifdef __thumb2__ - adr r9,.Lpoly1305_blocks_neon - adr r11,.Lpoly1305_blocks - it ne - movne r11,r9 - adr r12,.Lpoly1305_emit - orr r11,r11,#1 @ thumb-ify addresses - orr r12,r12,#1 -# else - add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) - ite eq - addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) - addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) -# endif -#endif - ldrb r9,[$inp,#11] - orr r6,r6,r7,lsl#8 - ldrb r7,[$inp,#12] - orr r6,r6,r8,lsl#16 - ldrb r8,[$inp,#13] - orr r6,r6,r9,lsl#24 - ldrb r9,[$inp,#14] - and r6,r6,r3 - - ldrb r10,[$inp,#15] - orr r7,r7,r8,lsl#8 - str r4,[$ctx,#0] - orr r7,r7,r9,lsl#16 - str r5,[$ctx,#4] - orr r7,r7,r10,lsl#24 - str r6,[$ctx,#8] - and r7,r7,r3 - str r7,[$ctx,#12] -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - stmia r2,{r11,r12} @ fill functions table - mov r0,#1 -#else - mov r0,#0 -#endif -.Lno_key: - ldmia sp!,{r4-r11} -#if __ARM_ARCH__>=5 - ret @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size poly1305_init,.-poly1305_init -___ -{ -my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); -my ($s1,$s2,$s3)=($r1,$r2,$r3); - -$code.=<<___; -.type poly1305_blocks,%function -.align 5 -poly1305_blocks: -.Lpoly1305_blocks: - stmdb sp!,{r3-r11,lr} - - ands $len,$len,#-16 - beq .Lno_data - - add $len,$len,$inp @ end pointer - sub sp,sp,#32 - -#if __ARM_ARCH__<7 - ldmia $ctx,{$h0-$r3} @ load context - add $ctx,$ctx,#20 - str $len,[sp,#16] @ offload stuff - str $ctx,[sp,#12] -#else - ldr lr,[$ctx,#36] @ is_base2_26 - ldmia $ctx!,{$h0-$h4} @ load hash value - str $len,[sp,#16] @ offload stuff - str $ctx,[sp,#12] - - adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 - mov $r1,$h1,lsr#6 - adcs $r1,$r1,$h2,lsl#20 - mov $r2,$h2,lsr#12 - adcs $r2,$r2,$h3,lsl#14 - mov $r3,$h3,lsr#18 - adcs $r3,$r3,$h4,lsl#8 - mov $len,#0 - teq lr,#0 - str $len,[$ctx,#16] @ clear is_base2_26 - adc $len,$len,$h4,lsr#24 - - itttt ne - movne $h0,$r0 @ choose between radixes - movne $h1,$r1 - movne $h2,$r2 - movne $h3,$r3 - ldmia $ctx,{$r0-$r3} @ load key - it ne - movne $h4,$len -#endif - - mov lr,$inp - cmp $padbit,#0 - str $r1,[sp,#20] - str $r2,[sp,#24] - str $r3,[sp,#28] - b .Loop - -.align 4 -.Loop: -#if __ARM_ARCH__<7 - ldrb r0,[lr],#16 @ load input -# ifdef __thumb2__ - it hi -# endif - addhi $h4,$h4,#1 @ 1<<128 - ldrb r1,[lr,#-15] - ldrb r2,[lr,#-14] - ldrb r3,[lr,#-13] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-12] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-11] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-10] - adds $h0,$h0,r3 @ accumulate input - - ldrb r3,[lr,#-9] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-8] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-7] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-6] - adcs $h1,$h1,r3 - - ldrb r3,[lr,#-5] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-4] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-3] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-2] - adcs $h2,$h2,r3 - - ldrb r3,[lr,#-1] - orr r1,r0,r1,lsl#8 - str lr,[sp,#8] @ offload input pointer - orr r2,r1,r2,lsl#16 - add $s1,$r1,$r1,lsr#2 - orr r3,r2,r3,lsl#24 -#else - ldr r0,[lr],#16 @ load input - it hi - addhi $h4,$h4,#1 @ padbit - ldr r1,[lr,#-12] - ldr r2,[lr,#-8] - ldr r3,[lr,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 
- rev r2,r2 - rev r3,r3 -# endif - adds $h0,$h0,r0 @ accumulate input - str lr,[sp,#8] @ offload input pointer - adcs $h1,$h1,r1 - add $s1,$r1,$r1,lsr#2 - adcs $h2,$h2,r2 -#endif - add $s2,$r2,$r2,lsr#2 - adcs $h3,$h3,r3 - add $s3,$r3,$r3,lsr#2 - - umull r2,r3,$h1,$r0 - adc $h4,$h4,#0 - umull r0,r1,$h0,$r0 - umlal r2,r3,$h4,$s1 - umlal r0,r1,$h3,$s1 - ldr $r1,[sp,#20] @ reload $r1 - umlal r2,r3,$h2,$s3 - umlal r0,r1,$h1,$s3 - umlal r2,r3,$h3,$s2 - umlal r0,r1,$h2,$s2 - umlal r2,r3,$h0,$r1 - str r0,[sp,#0] @ future $h0 - mul r0,$s2,$h4 - ldr $r2,[sp,#24] @ reload $r2 - adds r2,r2,r1 @ d1+=d0>>32 - eor r1,r1,r1 - adc lr,r3,#0 @ future $h2 - str r2,[sp,#4] @ future $h1 - - mul r2,$s3,$h4 - eor r3,r3,r3 - umlal r0,r1,$h3,$s3 - ldr $r3,[sp,#28] @ reload $r3 - umlal r2,r3,$h3,$r0 - umlal r0,r1,$h2,$r0 - umlal r2,r3,$h2,$r1 - umlal r0,r1,$h1,$r1 - umlal r2,r3,$h1,$r2 - umlal r0,r1,$h0,$r2 - umlal r2,r3,$h0,$r3 - ldr $h0,[sp,#0] - mul $h4,$r0,$h4 - ldr $h1,[sp,#4] - - adds $h2,lr,r0 @ d2+=d1>>32 - ldr lr,[sp,#8] @ reload input pointer - adc r1,r1,#0 - adds $h3,r2,r1 @ d3+=d2>>32 - ldr r0,[sp,#16] @ reload end pointer - adc r3,r3,#0 - add $h4,$h4,r3 @ h4+=d3>>32 - - and r1,$h4,#-4 - and $h4,$h4,#3 - add r1,r1,r1,lsr#2 @ *=5 - adds $h0,$h0,r1 - adcs $h1,$h1,#0 - adcs $h2,$h2,#0 - adcs $h3,$h3,#0 - adc $h4,$h4,#0 - - cmp r0,lr @ done yet? - bhi .Loop - - ldr $ctx,[sp,#12] - add sp,sp,#32 - stmdb $ctx,{$h0-$h4} @ store the result - -.Lno_data: -#if __ARM_ARCH__>=5 - ldmia sp!,{r3-r11,pc} -#else - ldmia sp!,{r3-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size poly1305_blocks,.-poly1305_blocks -___ -} -{ -my ($ctx,$mac,$nonce)=map("r$_",(0..2)); -my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); -my $g4=$ctx; - -$code.=<<___; -.type poly1305_emit,%function -.align 5 -poly1305_emit: -.Lpoly1305_emit: - stmdb sp!,{r4-r11} - - ldmia $ctx,{$h0-$h4} - -#if __ARM_ARCH__>=7 - ldr ip,[$ctx,#36] @ is_base2_26 - - adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 - mov $g1,$h1,lsr#6 - adcs $g1,$g1,$h2,lsl#20 - mov $g2,$h2,lsr#12 - adcs $g2,$g2,$h3,lsl#14 - mov $g3,$h3,lsr#18 - adcs $g3,$g3,$h4,lsl#8 - mov $g4,#0 - adc $g4,$g4,$h4,lsr#24 - - tst ip,ip - itttt ne - movne $h0,$g0 - movne $h1,$g1 - movne $h2,$g2 - movne $h3,$g3 - it ne - movne $h4,$g4 -#endif - - adds $g0,$h0,#5 @ compare to modulus - adcs $g1,$h1,#0 - adcs $g2,$h2,#0 - adcs $g3,$h3,#0 - adc $g4,$h4,#0 - tst $g4,#4 @ did it carry/borrow? 
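The sequence above is the usual constant-time "freeze": compute g = h + 5; if that addition carries into bit 130 (the #4 bit of $g4), then h was at least 2^130 - 5 and g holds the reduced value, otherwise h already did. The nonce is then added mod 2^128 to form the tag. A scalar C sketch with hypothetical names (the asm does the selection with predicated moves):

static void poly1305_freeze_and_add_nonce(u32 h[5], const u32 nonce[4])
{
	u32 g[4], g4, mask;
	u64 t;
	int i;

	/* g = h + 5, carrying across the 32-bit limbs */
	t = (u64)h[0] + 5;         g[0] = (u32)t;
	t = (u64)h[1] + (t >> 32); g[1] = (u32)t;
	t = (u64)h[2] + (t >> 32); g[2] = (u32)t;
	t = (u64)h[3] + (t >> 32); g[3] = (u32)t;
	g4 = h[4] + (u32)(t >> 32);

	/* if the +5 carried into bit 130, h >= p, so take g instead of h */
	mask = -(u32)((g4 >> 2) & 1);
	for (i = 0; i < 4; i++)
		h[i] = (h[i] & ~mask) | (g[i] & mask);

	/* tag = (h + nonce) mod 2^128; the final carry is discarded */
	t = 0;
	for (i = 0; i < 4; i++) {
		t += (u64)h[i] + nonce[i];
		h[i] = (u32)t;
		t >>= 32;
	}
}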
- -#ifdef __thumb2__ - it ne -#endif - movne $h0,$g0 - ldr $g0,[$nonce,#0] -#ifdef __thumb2__ - it ne -#endif - movne $h1,$g1 - ldr $g1,[$nonce,#4] -#ifdef __thumb2__ - it ne -#endif - movne $h2,$g2 - ldr $g2,[$nonce,#8] -#ifdef __thumb2__ - it ne -#endif - movne $h3,$g3 - ldr $g3,[$nonce,#12] - - adds $h0,$h0,$g0 - adcs $h1,$h1,$g1 - adcs $h2,$h2,$g2 - adc $h3,$h3,$g3 - -#if __ARM_ARCH__>=7 -# ifdef __ARMEB__ - rev $h0,$h0 - rev $h1,$h1 - rev $h2,$h2 - rev $h3,$h3 -# endif - str $h0,[$mac,#0] - str $h1,[$mac,#4] - str $h2,[$mac,#8] - str $h3,[$mac,#12] -#else - strb $h0,[$mac,#0] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#4] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#8] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#12] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#1] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#5] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#9] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#13] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#2] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#6] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#10] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#14] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#3] - strb $h1,[$mac,#7] - strb $h2,[$mac,#11] - strb $h3,[$mac,#15] -#endif - ldmia sp!,{r4-r11} -#if __ARM_ARCH__>=5 - ret @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size poly1305_emit,.-poly1305_emit -___ -{ -my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); -my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); -my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); - -my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); - -$code.=<<___; -#if __ARM_MAX_ARCH__>=7 -.fpu neon - -.type poly1305_init_neon,%function -.align 5 -poly1305_init_neon: -.Lpoly1305_init_neon: - ldr r3,[$ctx,#48] @ first table element - cmp r3,#-1 @ is value impossible? 
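The "base 2^32 -> base 2^26" conversion performed just below (and repeated for the hash value in poly1305_blocks_neon) splits four 32-bit words into five 26-bit limbs, so that the per-limb products in the NEON code fit comfortably in 64 bits. In C the split is:

static void poly1305_radix32_to_26(const u32 w[4], u32 l[5])
{
	l[0] =   w[0]                        & 0x3ffffff;
	l[1] = ((w[0] >> 26) | (w[1] <<  6)) & 0x3ffffff;
	l[2] = ((w[1] >> 20) | (w[2] << 12)) & 0x3ffffff;
	l[3] = ((w[2] >> 14) | (w[3] << 18)) & 0x3ffffff;
	l[4] =   w[3] >> 8;	/* the hash value also ORs in bits above 2^128 */
}

(Hypothetical helper name; this only restates what the shifts and masks below do.)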
- bne .Lno_init_neon - - ldr r4,[$ctx,#20] @ load key base 2^32 - ldr r5,[$ctx,#24] - ldr r6,[$ctx,#28] - ldr r7,[$ctx,#32] - - and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 - mov r3,r4,lsr#26 - mov r4,r5,lsr#20 - orr r3,r3,r5,lsl#6 - mov r5,r6,lsr#14 - orr r4,r4,r6,lsl#12 - mov r6,r7,lsr#8 - orr r5,r5,r7,lsl#18 - and r3,r3,#0x03ffffff - and r4,r4,#0x03ffffff - and r5,r5,#0x03ffffff - - vdup.32 $R0,r2 @ r^1 in both lanes - add r2,r3,r3,lsl#2 @ *5 - vdup.32 $R1,r3 - add r3,r4,r4,lsl#2 - vdup.32 $S1,r2 - vdup.32 $R2,r4 - add r4,r5,r5,lsl#2 - vdup.32 $S2,r3 - vdup.32 $R3,r5 - add r5,r6,r6,lsl#2 - vdup.32 $S3,r4 - vdup.32 $R4,r6 - vdup.32 $S4,r5 - - mov $zeros,#2 @ counter - -.Lsquare_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - - vmull.u32 $D0,$R0,${R0}[1] - vmull.u32 $D1,$R1,${R0}[1] - vmull.u32 $D2,$R2,${R0}[1] - vmull.u32 $D3,$R3,${R0}[1] - vmull.u32 $D4,$R4,${R0}[1] - - vmlal.u32 $D0,$R4,${S1}[1] - vmlal.u32 $D1,$R0,${R1}[1] - vmlal.u32 $D2,$R1,${R1}[1] - vmlal.u32 $D3,$R2,${R1}[1] - vmlal.u32 $D4,$R3,${R1}[1] - - vmlal.u32 $D0,$R3,${S2}[1] - vmlal.u32 $D1,$R4,${S2}[1] - vmlal.u32 $D3,$R1,${R2}[1] - vmlal.u32 $D2,$R0,${R2}[1] - vmlal.u32 $D4,$R2,${R2}[1] - - vmlal.u32 $D0,$R2,${S3}[1] - vmlal.u32 $D3,$R0,${R3}[1] - vmlal.u32 $D1,$R3,${S3}[1] - vmlal.u32 $D2,$R4,${S3}[1] - vmlal.u32 $D4,$R1,${R3}[1] - - vmlal.u32 $D3,$R4,${S4}[1] - vmlal.u32 $D0,$R1,${S4}[1] - vmlal.u32 $D1,$R2,${S4}[1] - vmlal.u32 $D2,$R3,${S4}[1] - vmlal.u32 $D4,$R0,${R4}[1] - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - @ and P. Schwabe - @ - @ H0>>+H1>>+H2>>+H3>>+H4 - @ H3>>+H4>>*5+H0>>+H1 - @ - @ Trivia. - @ - @ Result of multiplication of n-bit number by m-bit number is - @ n+m bits wide. However! Even though 2^n is a n+1-bit number, - @ m-bit number multiplied by 2^n is still n+m bits wide. - @ - @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, - @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit - @ one is n+1 bits wide. - @ - @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that - @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 - @ can be 27. However! In cases when their width exceeds 26 bits - @ they are limited by 2^26+2^6. This in turn means that *sum* - @ of the products with these values can still be viewed as sum - @ of 52-bit numbers as long as the amount of addends is not a - @ power of 2. For example, - @ - @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, - @ - @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or - @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than - @ 8 * (2^52) or 2^55. However, the value is then multiplied by - @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), - @ which is less than 32 * (2^52) or 2^57. And when processing - @ data we are looking at triple as many addends... - @ - @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and - @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the - @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while - @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 - @ instruction accepts 2x32-bit input and writes 2x64-bit result. 
- @ This means that result of reduction have to be compressed upon - @ loop wrap-around. This can be done in the process of reduction - @ to minimize amount of instructions [as well as amount of - @ 128-bit instructions, which benefits low-end processors], but - @ one has to watch for H2 (which is narrower than H0) and 5*H4 - @ not being wider than 58 bits, so that result of right shift - @ by 26 bits fits in 32 bits. This is also useful on x86, - @ because it allows to use paddd in place for paddq, which - @ benefits Atom, where paddq is ridiculously slow. - - vshr.u64 $T0,$D3,#26 - vmovn.i64 $D3#lo,$D3 - vshr.u64 $T1,$D0,#26 - vmovn.i64 $D0#lo,$D0 - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - vbic.i32 $D0#lo,#0xfc000000 - - vshrn.u64 $T0#lo,$D4,#26 - vmovn.i64 $D4#lo,$D4 - vshr.u64 $T1,$D1,#26 - vmovn.i64 $D1#lo,$D1 - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - vbic.i32 $D4#lo,#0xfc000000 - vbic.i32 $D1#lo,#0xfc000000 - - vadd.i32 $D0#lo,$D0#lo,$T0#lo - vshl.u32 $T0#lo,$T0#lo,#2 - vshrn.u64 $T1#lo,$D2,#26 - vmovn.i64 $D2#lo,$D2 - vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 - vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 - vbic.i32 $D2#lo,#0xfc000000 - - vshr.u32 $T0#lo,$D0#lo,#26 - vbic.i32 $D0#lo,#0xfc000000 - vshr.u32 $T1#lo,$D3#lo,#26 - vbic.i32 $D3#lo,#0xfc000000 - vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 - vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 - - subs $zeros,$zeros,#1 - beq .Lsquare_break_neon - - add $tbl0,$ctx,#(48+0*9*4) - add $tbl1,$ctx,#(48+1*9*4) - - vtrn.32 $R0,$D0#lo @ r^2:r^1 - vtrn.32 $R2,$D2#lo - vtrn.32 $R3,$D3#lo - vtrn.32 $R1,$D1#lo - vtrn.32 $R4,$D4#lo - - vshl.u32 $S2,$R2,#2 @ *5 - vshl.u32 $S3,$R3,#2 - vshl.u32 $S1,$R1,#2 - vshl.u32 $S4,$R4,#2 - vadd.i32 $S2,$S2,$R2 - vadd.i32 $S1,$S1,$R1 - vadd.i32 $S3,$S3,$R3 - vadd.i32 $S4,$S4,$R4 - - vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! - vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! - vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vst1.32 {${S4}[0]},[$tbl0,:32] - vst1.32 {${S4}[1]},[$tbl1,:32] - - b .Lsquare_neon - -.align 4 -.Lsquare_break_neon: - add $tbl0,$ctx,#(48+2*4*9) - add $tbl1,$ctx,#(48+3*4*9) - - vmov $R0,$D0#lo @ r^4:r^3 - vshl.u32 $S1,$D1#lo,#2 @ *5 - vmov $R1,$D1#lo - vshl.u32 $S2,$D2#lo,#2 - vmov $R2,$D2#lo - vshl.u32 $S3,$D3#lo,#2 - vmov $R3,$D3#lo - vshl.u32 $S4,$D4#lo,#2 - vmov $R4,$D4#lo - vadd.i32 $S1,$S1,$D1#lo - vadd.i32 $S2,$S2,$D2#lo - vadd.i32 $S3,$S3,$D3#lo - vadd.i32 $S4,$S4,$D4#lo - - vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! - vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! - vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vst1.32 {${S4}[0]},[$tbl0] - vst1.32 {${S4}[1]},[$tbl1] - -.Lno_init_neon: - ret @ bx lr -.size poly1305_init_neon,.-poly1305_init_neon - -.type poly1305_blocks_neon,%function -.align 5 -poly1305_blocks_neon: -.Lpoly1305_blocks_neon: - ldr ip,[$ctx,#36] @ is_base2_26 - - cmp $len,#64 - blo .Lpoly1305_blocks - - stmdb sp!,{r4-r7} - vstmdb sp!,{d8-d15} @ ABI specification says so - - tst ip,ip @ is_base2_26? 
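The vshr/vbic carry chain above is the standard partial reduction in radix 2^26: each limb's overflow moves into the next limb, and the overflow out of the top limb is folded back into limb 0 multiplied by 5, because 2^130 = 5 (mod 2^130 - 5). One pass looks roughly like this in scalar C, with d[] standing for the 64-bit accumulators:

static void poly1305_partial_reduce(u64 d[5])
{
	u64 c;

	c = d[3] >> 26; d[3] &= 0x3ffffff; d[4] += c;		/* h3 -> h4      */
	c = d[0] >> 26; d[0] &= 0x3ffffff; d[1] += c;		/* h0 -> h1      */
	c = d[4] >> 26; d[4] &= 0x3ffffff; d[0] += c * 5;	/* h4 -> h0 (x5) */
	c = d[1] >> 26; d[1] &= 0x3ffffff; d[2] += c;		/* h1 -> h2      */
	c = d[0] >> 26; d[0] &= 0x3ffffff; d[1] += c;		/* h0 -> h1      */
	c = d[2] >> 26; d[2] &= 0x3ffffff; d[3] += c;		/* h2 -> h3      */
	c = d[3] >> 26; d[3] &= 0x3ffffff; d[4] += c;		/* h3 -> h4      */
}

The exact interleaving in the NEON code differs (it overlaps the carries with the next block's radix conversion), but the arithmetic is the same.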
- bne .Lbase2_26_neon - - stmdb sp!,{r1-r3,lr} - bl .Lpoly1305_init_neon - - ldr r4,[$ctx,#0] @ load hash value base 2^32 - ldr r5,[$ctx,#4] - ldr r6,[$ctx,#8] - ldr r7,[$ctx,#12] - ldr ip,[$ctx,#16] - - and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 - mov r3,r4,lsr#26 - veor $D0#lo,$D0#lo,$D0#lo - mov r4,r5,lsr#20 - orr r3,r3,r5,lsl#6 - veor $D1#lo,$D1#lo,$D1#lo - mov r5,r6,lsr#14 - orr r4,r4,r6,lsl#12 - veor $D2#lo,$D2#lo,$D2#lo - mov r6,r7,lsr#8 - orr r5,r5,r7,lsl#18 - veor $D3#lo,$D3#lo,$D3#lo - and r3,r3,#0x03ffffff - orr r6,r6,ip,lsl#24 - veor $D4#lo,$D4#lo,$D4#lo - and r4,r4,#0x03ffffff - mov r1,#1 - and r5,r5,#0x03ffffff - str r1,[$ctx,#36] @ set is_base2_26 - - vmov.32 $D0#lo[0],r2 - vmov.32 $D1#lo[0],r3 - vmov.32 $D2#lo[0],r4 - vmov.32 $D3#lo[0],r5 - vmov.32 $D4#lo[0],r6 - adr $zeros,.Lzeros - - ldmia sp!,{r1-r3,lr} - b .Lhash_loaded - -.align 4 -.Lbase2_26_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ load hash value - - veor $D0#lo,$D0#lo,$D0#lo - veor $D1#lo,$D1#lo,$D1#lo - veor $D2#lo,$D2#lo,$D2#lo - veor $D3#lo,$D3#lo,$D3#lo - veor $D4#lo,$D4#lo,$D4#lo - vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! - adr $zeros,.Lzeros - vld1.32 {$D4#lo[0]},[$ctx] - sub $ctx,$ctx,#16 @ rewind - -.Lhash_loaded: - add $in2,$inp,#32 - mov $padbit,$padbit,lsl#24 - tst $len,#31 - beq .Leven - - vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! - vmov.32 $H4#lo[0],$padbit - sub $len,$len,#16 - add $in2,$inp,#32 - -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H3,$H3 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 -# endif - vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 - vshl.u32 $H3#lo,$H3#lo,#18 - - vsri.u32 $H3#lo,$H2#lo,#14 - vshl.u32 $H2#lo,$H2#lo,#12 - vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi - - vbic.i32 $H3#lo,#0xfc000000 - vsri.u32 $H2#lo,$H1#lo,#20 - vshl.u32 $H1#lo,$H1#lo,#6 - - vbic.i32 $H2#lo,#0xfc000000 - vsri.u32 $H1#lo,$H0#lo,#26 - vadd.i32 $H3#hi,$H3#lo,$D3#lo - - vbic.i32 $H0#lo,#0xfc000000 - vbic.i32 $H1#lo,#0xfc000000 - vadd.i32 $H2#hi,$H2#lo,$D2#lo - - vadd.i32 $H0#hi,$H0#lo,$D0#lo - vadd.i32 $H1#hi,$H1#lo,$D1#lo - - mov $tbl1,$zeros - add $tbl0,$ctx,#48 - - cmp $len,$len - b .Long_tail - -.align 4 -.Leven: - subs $len,$len,#64 - it lo - movlo $in2,$zeros - - vmov.i32 $H4,#1<<24 @ padbit, yes, always - vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] - add $inp,$inp,#64 - vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) - add $in2,$in2,#64 - itt hi - addhi $tbl1,$ctx,#(48+1*9*4) - addhi $tbl0,$ctx,#(48+3*9*4) - -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H3,$H3 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 -# endif - vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 - vshl.u32 $H3,$H3,#18 - - vsri.u32 $H3,$H2,#14 - vshl.u32 $H2,$H2,#12 - - vbic.i32 $H3,#0xfc000000 - vsri.u32 $H2,$H1,#20 - vshl.u32 $H1,$H1,#6 - - vbic.i32 $H2,#0xfc000000 - vsri.u32 $H1,$H0,#26 - - vbic.i32 $H0,#0xfc000000 - vbic.i32 $H1,#0xfc000000 - - bls .Lskip_loop - - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
- b .Loop_neon - -.align 5 -.Loop_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - @ \___________________/ - @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - @ \___________________/ \____________________/ - @ - @ Note that we start with inp[2:3]*r^2. This is because it - @ doesn't depend on reduction in previous iteration. - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ inp[2:3]*r^2 - - vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] - vmull.u32 $D2,$H2#hi,${R0}[1] - vadd.i32 $H0#lo,$H0#lo,$D0#lo - vmull.u32 $D0,$H0#hi,${R0}[1] - vadd.i32 $H3#lo,$H3#lo,$D3#lo - vmull.u32 $D3,$H3#hi,${R0}[1] - vmlal.u32 $D2,$H1#hi,${R1}[1] - vadd.i32 $H1#lo,$H1#lo,$D1#lo - vmull.u32 $D1,$H1#hi,${R0}[1] - - vadd.i32 $H4#lo,$H4#lo,$D4#lo - vmull.u32 $D4,$H4#hi,${R0}[1] - subs $len,$len,#64 - vmlal.u32 $D0,$H4#hi,${S1}[1] - it lo - movlo $in2,$zeros - vmlal.u32 $D3,$H2#hi,${R1}[1] - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D1,$H0#hi,${R1}[1] - vmlal.u32 $D4,$H3#hi,${R1}[1] - - vmlal.u32 $D0,$H3#hi,${S2}[1] - vmlal.u32 $D3,$H1#hi,${R2}[1] - vmlal.u32 $D4,$H2#hi,${R2}[1] - vmlal.u32 $D1,$H4#hi,${S2}[1] - vmlal.u32 $D2,$H0#hi,${R2}[1] - - vmlal.u32 $D3,$H0#hi,${R3}[1] - vmlal.u32 $D0,$H2#hi,${S3}[1] - vmlal.u32 $D4,$H1#hi,${R3}[1] - vmlal.u32 $D1,$H3#hi,${S3}[1] - vmlal.u32 $D2,$H4#hi,${S3}[1] - - vmlal.u32 $D3,$H4#hi,${S4}[1] - vmlal.u32 $D0,$H1#hi,${S4}[1] - vmlal.u32 $D4,$H0#hi,${R4}[1] - vmlal.u32 $D1,$H2#hi,${S4}[1] - vmlal.u32 $D2,$H3#hi,${S4}[1] - - vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) - add $in2,$in2,#64 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ (hash+inp[0:1])*r^4 and accumulate - - vmlal.u32 $D3,$H3#lo,${R0}[0] - vmlal.u32 $D0,$H0#lo,${R0}[0] - vmlal.u32 $D4,$H4#lo,${R0}[0] - vmlal.u32 $D1,$H1#lo,${R0}[0] - vmlal.u32 $D2,$H2#lo,${R0}[0] - vld1.32 ${S4}[0],[$tbl0,:32] - - vmlal.u32 $D3,$H2#lo,${R1}[0] - vmlal.u32 $D0,$H4#lo,${S1}[0] - vmlal.u32 $D4,$H3#lo,${R1}[0] - vmlal.u32 $D1,$H0#lo,${R1}[0] - vmlal.u32 $D2,$H1#lo,${R1}[0] - - vmlal.u32 $D3,$H1#lo,${R2}[0] - vmlal.u32 $D0,$H3#lo,${S2}[0] - vmlal.u32 $D4,$H2#lo,${R2}[0] - vmlal.u32 $D1,$H4#lo,${S2}[0] - vmlal.u32 $D2,$H0#lo,${R2}[0] - - vmlal.u32 $D3,$H0#lo,${R3}[0] - vmlal.u32 $D0,$H2#lo,${S3}[0] - vmlal.u32 $D4,$H1#lo,${R3}[0] - vmlal.u32 $D1,$H3#lo,${S3}[0] - vmlal.u32 $D3,$H4#lo,${S4}[0] - - vmlal.u32 $D2,$H4#lo,${S3}[0] - vmlal.u32 $D0,$H1#lo,${S4}[0] - vmlal.u32 $D4,$H0#lo,${R4}[0] - vmov.i32 $H4,#1<<24 @ padbit, yes, always - vmlal.u32 $D1,$H2#lo,${S4}[0] - vmlal.u32 $D2,$H3#lo,${S4}[0] - - vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] - add $inp,$inp,#64 -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 - vrev32.8 $H3,$H3 -# endif - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction interleaved with base 2^32 -> base 2^26 of - @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 
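For reference, the lazy reduction that the following instruction sequence performs (leaving aside the interleaved radix conversion of the freshly loaded input words) looks roughly like this in portable C. This is an illustrative sketch only, not part of the patch; the function name and the use of uint64_t limbs are invented for the example, whereas the real code keeps the five 26-bit limbs in NEON vector lanes:

	#include <stdint.h>

	/*
	 * One pass of Poly1305 "lazy" carry propagation over five 26-bit
	 * limbs h[0..4] held in 64-bit words.  The carry out of h[4]
	 * represents multiples of 2^130, and 2^130 == 5 (mod 2^130 - 5),
	 * so it is folded back into h[0] multiplied by 5.  After the pass
	 * every limb is again small enough for the next multiplication.
	 */
	static void poly1305_lazy_reduce(uint64_t h[5])
	{
		uint64_t c;

		c = h[3] >> 26;  h[3] &= 0x3ffffff;  h[4] += c;      /* h3 -> h4 */
		c = h[0] >> 26;  h[0] &= 0x3ffffff;  h[1] += c;      /* h0 -> h1 */
		c = h[4] >> 26;  h[4] &= 0x3ffffff;  h[0] += c * 5;  /* h4 -> h0 */
		c = h[1] >> 26;  h[1] &= 0x3ffffff;  h[2] += c;      /* h1 -> h2 */
		c = h[2] >> 26;  h[2] &= 0x3ffffff;  h[3] += c;      /* h2 -> h3 */
		c = h[0] >> 26;  h[0] &= 0x3ffffff;  h[1] += c;      /* h0 -> h1 */
		c = h[3] >> 26;  h[3] &= 0x3ffffff;  h[4] += c;      /* h3 -> h4 */
	}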
- - vshr.u64 $T0,$D3,#26 - vmovn.i64 $D3#lo,$D3 - vshr.u64 $T1,$D0,#26 - vmovn.i64 $D0#lo,$D0 - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vbic.i32 $D3#lo,#0xfc000000 - vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - vshl.u32 $H3,$H3,#18 - vbic.i32 $D0#lo,#0xfc000000 - - vshrn.u64 $T0#lo,$D4,#26 - vmovn.i64 $D4#lo,$D4 - vshr.u64 $T1,$D1,#26 - vmovn.i64 $D1#lo,$D1 - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - vsri.u32 $H3,$H2,#14 - vbic.i32 $D4#lo,#0xfc000000 - vshl.u32 $H2,$H2,#12 - vbic.i32 $D1#lo,#0xfc000000 - - vadd.i32 $D0#lo,$D0#lo,$T0#lo - vshl.u32 $T0#lo,$T0#lo,#2 - vbic.i32 $H3,#0xfc000000 - vshrn.u64 $T1#lo,$D2,#26 - vmovn.i64 $D2#lo,$D2 - vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] - vsri.u32 $H2,$H1,#20 - vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 - vshl.u32 $H1,$H1,#6 - vbic.i32 $D2#lo,#0xfc000000 - vbic.i32 $H2,#0xfc000000 - - vshrn.u64 $T0#lo,$D0,#26 @ re-narrow - vmovn.i64 $D0#lo,$D0 - vsri.u32 $H1,$H0,#26 - vbic.i32 $H0,#0xfc000000 - vshr.u32 $T1#lo,$D3#lo,#26 - vbic.i32 $D3#lo,#0xfc000000 - vbic.i32 $D0#lo,#0xfc000000 - vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 - vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 - vbic.i32 $H1,#0xfc000000 - - bhi .Loop_neon - -.Lskip_loop: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - add $tbl1,$ctx,#(48+0*9*4) - add $tbl0,$ctx,#(48+1*9*4) - adds $len,$len,#32 - it ne - movne $len,#0 - bne .Long_tail - - vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi - vadd.i32 $H0#hi,$H0#lo,$D0#lo - vadd.i32 $H3#hi,$H3#lo,$D3#lo - vadd.i32 $H1#hi,$H1#lo,$D1#lo - vadd.i32 $H4#hi,$H4#lo,$D4#lo - -.Long_tail: - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 - - vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant - vmull.u32 $D2,$H2#hi,$R0 - vadd.i32 $H0#lo,$H0#lo,$D0#lo - vmull.u32 $D0,$H0#hi,$R0 - vadd.i32 $H3#lo,$H3#lo,$D3#lo - vmull.u32 $D3,$H3#hi,$R0 - vadd.i32 $H1#lo,$H1#lo,$D1#lo - vmull.u32 $D1,$H1#hi,$R0 - vadd.i32 $H4#lo,$H4#lo,$D4#lo - vmull.u32 $D4,$H4#hi,$R0 - - vmlal.u32 $D0,$H4#hi,$S1 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vmlal.u32 $D3,$H2#hi,$R1 - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vmlal.u32 $D1,$H0#hi,$R1 - vmlal.u32 $D4,$H3#hi,$R1 - vmlal.u32 $D2,$H1#hi,$R1 - - vmlal.u32 $D3,$H1#hi,$R2 - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D0,$H3#hi,$S2 - vld1.32 ${S4}[0],[$tbl0,:32] - vmlal.u32 $D4,$H2#hi,$R2 - vmlal.u32 $D1,$H4#hi,$S2 - vmlal.u32 $D2,$H0#hi,$R2 - - vmlal.u32 $D3,$H0#hi,$R3 - it ne - addne $tbl1,$ctx,#(48+2*9*4) - vmlal.u32 $D0,$H2#hi,$S3 - it ne - addne $tbl0,$ctx,#(48+3*9*4) - vmlal.u32 $D4,$H1#hi,$R3 - vmlal.u32 $D1,$H3#hi,$S3 - vmlal.u32 $D2,$H4#hi,$S3 - - vmlal.u32 $D3,$H4#hi,$S4 - vorn $MASK,$MASK,$MASK @ all-ones, can be redundant - vmlal.u32 $D0,$H1#hi,$S4 - vshr.u64 $MASK,$MASK,#38 - vmlal.u32 $D4,$H0#hi,$R4 - vmlal.u32 $D1,$H2#hi,$S4 - vmlal.u32 $D2,$H3#hi,$S4 - - beq .Lshort_tail - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ (hash+inp[0:1])*r^4:r^3 and accumulate - - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 - - vmlal.u32 $D2,$H2#lo,$R0 - vmlal.u32 $D0,$H0#lo,$R0 - vmlal.u32 $D3,$H3#lo,$R0 - vmlal.u32 $D1,$H1#lo,$R0 - vmlal.u32 $D4,$H4#lo,$R0 - - vmlal.u32 $D0,$H4#lo,$S1 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 
- vmlal.u32 $D3,$H2#lo,$R1 - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vmlal.u32 $D1,$H0#lo,$R1 - vmlal.u32 $D4,$H3#lo,$R1 - vmlal.u32 $D2,$H1#lo,$R1 - - vmlal.u32 $D3,$H1#lo,$R2 - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D0,$H3#lo,$S2 - vld1.32 ${S4}[0],[$tbl0,:32] - vmlal.u32 $D4,$H2#lo,$R2 - vmlal.u32 $D1,$H4#lo,$S2 - vmlal.u32 $D2,$H0#lo,$R2 - - vmlal.u32 $D3,$H0#lo,$R3 - vmlal.u32 $D0,$H2#lo,$S3 - vmlal.u32 $D4,$H1#lo,$R3 - vmlal.u32 $D1,$H3#lo,$S3 - vmlal.u32 $D2,$H4#lo,$S3 - - vmlal.u32 $D3,$H4#lo,$S4 - vorn $MASK,$MASK,$MASK @ all-ones - vmlal.u32 $D0,$H1#lo,$S4 - vshr.u64 $MASK,$MASK,#38 - vmlal.u32 $D4,$H0#lo,$R4 - vmlal.u32 $D1,$H2#lo,$S4 - vmlal.u32 $D2,$H3#lo,$S4 - -.Lshort_tail: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ horizontal addition - - vadd.i64 $D3#lo,$D3#lo,$D3#hi - vadd.i64 $D0#lo,$D0#lo,$D0#hi - vadd.i64 $D4#lo,$D4#lo,$D4#hi - vadd.i64 $D1#lo,$D1#lo,$D1#hi - vadd.i64 $D2#lo,$D2#lo,$D2#hi - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction, but without narrowing - - vshr.u64 $T0,$D3,#26 - vand.i64 $D3,$D3,$MASK - vshr.u64 $T1,$D0,#26 - vand.i64 $D0,$D0,$MASK - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - - vshr.u64 $T0,$D4,#26 - vand.i64 $D4,$D4,$MASK - vshr.u64 $T1,$D1,#26 - vand.i64 $D1,$D1,$MASK - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - - vadd.i64 $D0,$D0,$T0 - vshl.u64 $T0,$T0,#2 - vshr.u64 $T1,$D2,#26 - vand.i64 $D2,$D2,$MASK - vadd.i64 $D0,$D0,$T0 @ h4 -> h0 - vadd.i64 $D3,$D3,$T1 @ h2 -> h3 - - vshr.u64 $T0,$D0,#26 - vand.i64 $D0,$D0,$MASK - vshr.u64 $T1,$D3,#26 - vand.i64 $D3,$D3,$MASK - vadd.i64 $D1,$D1,$T0 @ h0 -> h1 - vadd.i64 $D4,$D4,$T1 @ h3 -> h4 - - cmp $len,#0 - bne .Leven - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ store hash value - - vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! - vst1.32 {$D4#lo[0]},[$ctx] - - vldmia sp!,{d8-d15} @ epilogue - ldmia sp!,{r4-r7} - ret @ bx lr -.size poly1305_blocks_neon,.-poly1305_blocks_neon - -.align 5 -.Lzeros: -.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -#ifndef __KERNEL__ -.LOPENSSL_armcap: -# ifdef _WIN32 -.word OPENSSL_armcap_P -# else -.word OPENSSL_armcap_P-.Lpoly1305_init -# endif -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif -#endif -___ -} } -$code.=<<___; -.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm" -.align 2 -___ - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; - - s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or - s/\bret\b/bx lr/go or - s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 - - print $_,"\n"; -} -close STDOUT; # enforce flush diff --git a/arch/arm/lib/crypto/poly1305-glue.c b/arch/arm/lib/crypto/poly1305-glue.c deleted file mode 100644 index 2603b0771f2c..000000000000 --- a/arch/arm/lib/crypto/poly1305-glue.c +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM - * - * Copyright (C) 2019 Linaro Ltd. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -void __weak poly1305_blocks_neon(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit) -{ -} - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) -{ - len = round_down(len, POLY1305_BLOCK_SIZE); - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon)) { - do { - unsigned int todo = min_t(unsigned int, len, SZ_4K); - - kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, padbit); - kernel_neon_end(); - - len -= todo; - src += todo; - } while (len); - } else - poly1305_blocks_arm(state, src, len, padbit); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -bool poly1305_is_arch_optimized(void) -{ - /* We always can use at least the ARM scalar implementation. */ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -static int __init arm_poly1305_mod_init(void) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - (elf_hwcap & HWCAP_NEON)) - static_branch_enable(&have_neon); - return 0; -} -subsys_initcall(arm_poly1305_mod_init); - -static void __exit arm_poly1305_mod_exit(void) -{ -} -module_exit(arm_poly1305_mod_exit); - -MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm/lib/crypto/sha256-armv4.pl b/arch/arm/lib/crypto/sha256-armv4.pl deleted file mode 100644 index 8122db7fd599..000000000000 --- a/arch/arm/lib/crypto/sha256-armv4.pl +++ /dev/null @@ -1,724 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -# This code is taken from the OpenSSL project but the author (Andy Polyakov) -# has relicensed it under the GPLv2. Therefore this program is free software; -# you can redistribute it and/or modify it under the terms of the GNU General -# Public License version 2 as published by the Free Software Foundation. -# -# The original headers, including the original license headers, are -# included below for completeness. - -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see https://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# SHA256 block procedure for ARMv4. May 2007. - -# Performance is ~2x better than gcc 3.4 generated code and in "abso- -# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per -# byte [on single-issue Xscale PXA250 core]. - -# July 2010. -# -# Rescheduling for dual-issue pipeline resulted in 22% improvement on -# Cortex A8 core and ~20 cycles per processed byte. - -# February 2011. 
-# -# Profiler-assisted and platform-specific optimization resulted in 16% -# improvement on Cortex A8 core and ~15.4 cycles per processed byte. - -# September 2013. -# -# Add NEON implementation. On Cortex A8 it was measured to process one -# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon -# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only -# code (meaning that latter performs sub-optimally, nothing was done -# about it). - -# May 2014. -# -# Add ARMv8 code path performing at 2.0 cpb on Apple A7. - -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; - -$ctx="r0"; $t0="r0"; -$inp="r1"; $t4="r1"; -$len="r2"; $t1="r2"; -$T1="r3"; $t3="r3"; -$A="r4"; -$B="r5"; -$C="r6"; -$D="r7"; -$E="r8"; -$F="r9"; -$G="r10"; -$H="r11"; -@V=($A,$B,$C,$D,$E,$F,$G,$H); -$t2="r12"; -$Ktbl="r14"; - -@Sigma0=( 2,13,22); -@Sigma1=( 6,11,25); -@sigma0=( 7,18, 3); -@sigma1=(17,19,10); - -sub BODY_00_15 { -my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; - -$code.=<<___ if ($i<16); -#if __ARM_ARCH__>=7 - @ ldr $t1,[$inp],#4 @ $i -# if $i==15 - str $inp,[sp,#17*4] @ make room for $t4 -# endif - eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` - add $a,$a,$t2 @ h+=Maj(a,b,c) from the past - eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) -# ifndef __ARMEB__ - rev $t1,$t1 -# endif -#else - @ ldrb $t1,[$inp,#3] @ $i - add $a,$a,$t2 @ h+=Maj(a,b,c) from the past - ldrb $t2,[$inp,#2] - ldrb $t0,[$inp,#1] - orr $t1,$t1,$t2,lsl#8 - ldrb $t2,[$inp],#4 - orr $t1,$t1,$t0,lsl#16 -# if $i==15 - str $inp,[sp,#17*4] @ make room for $t4 -# endif - eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` - orr $t1,$t1,$t2,lsl#24 - eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) -#endif -___ -$code.=<<___; - ldr $t2,[$Ktbl],#4 @ *K256++ - add $h,$h,$t1 @ h+=X[i] - str $t1,[sp,#`$i%16`*4] - eor $t1,$f,$g - add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) - and $t1,$t1,$e - add $h,$h,$t2 @ h+=K256[i] - eor $t1,$t1,$g @ Ch(e,f,g) - eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` - add $h,$h,$t1 @ h+=Ch(e,f,g) -#if $i==31 - and $t2,$t2,#0xff - cmp $t2,#0xf2 @ done? 
-#endif -#if $i<15 -# if __ARM_ARCH__>=7 - ldr $t1,[$inp],#4 @ prefetch -# else - ldrb $t1,[$inp,#3] -# endif - eor $t2,$a,$b @ a^b, b^c in next round -#else - ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx - eor $t2,$a,$b @ a^b, b^c in next round - ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx -#endif - eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) - and $t3,$t3,$t2 @ (b^c)&=(a^b) - add $d,$d,$h @ d+=h - eor $t3,$t3,$b @ Maj(a,b,c) - add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) - @ add $h,$h,$t3 @ h+=Maj(a,b,c) -___ - ($t2,$t3)=($t3,$t2); -} - -sub BODY_16_XX { -my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; - -$code.=<<___; - @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i - @ ldr $t4,[sp,#`($i+14)%16`*4] - mov $t0,$t1,ror#$sigma0[0] - add $a,$a,$t2 @ h+=Maj(a,b,c) from the past - mov $t2,$t4,ror#$sigma1[0] - eor $t0,$t0,$t1,ror#$sigma0[1] - eor $t2,$t2,$t4,ror#$sigma1[1] - eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) - ldr $t1,[sp,#`($i+0)%16`*4] - eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) - ldr $t4,[sp,#`($i+9)%16`*4] - - add $t2,$t2,$t0 - eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 - add $t1,$t1,$t2 - eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) - add $t1,$t1,$t4 @ X[i] -___ - &BODY_00_15(@_); -} - -$code=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ 7 -#endif - -.text -#if __ARM_ARCH__<7 -.code 32 -#else -.syntax unified -# ifdef __thumb2__ -.thumb -# else -.code 32 -# endif -#endif - -.type K256,%object -.align 5 -K256: -.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.size K256,.-K256 -.word 0 @ terminator -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha256_blocks_arch -#endif -.align 5 - -.global sha256_blocks_arch -.type sha256_blocks_arch,%function -sha256_blocks_arch: -.Lsha256_blocks_arch: -#if __ARM_ARCH__<7 - sub r3,pc,#8 @ sha256_blocks_arch -#else - adr r3,.Lsha256_blocks_arch -#endif -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P - tst r12,#ARMV8_SHA256 - bne .LARMv8 - tst r12,#ARMV7_NEON - bne .LNEON -#endif - add $len,$inp,$len,lsl#6 @ len to point at the end of inp - stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} - ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} - sub $Ktbl,r3,#256+32 @ K256 - sub sp,sp,#16*4 @ alloca(X[16]) -.Loop: -# if __ARM_ARCH__>=7 - ldr $t1,[$inp],#4 -# else - ldrb $t1,[$inp,#3] -# endif - eor $t3,$B,$C @ magic - eor $t2,$t2,$t2 -___ -for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } -$code.=".Lrounds_16_xx:\n"; -for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } -$code.=<<___; -#if __ARM_ARCH__>=7 - ite eq @ Thumb2 thing, sanity check in ARM -#endif - ldreq 
$t3,[sp,#16*4] @ pull ctx - bne .Lrounds_16_xx - - add $A,$A,$t2 @ h+=Maj(a,b,c) from the past - ldr $t0,[$t3,#0] - ldr $t1,[$t3,#4] - ldr $t2,[$t3,#8] - add $A,$A,$t0 - ldr $t0,[$t3,#12] - add $B,$B,$t1 - ldr $t1,[$t3,#16] - add $C,$C,$t2 - ldr $t2,[$t3,#20] - add $D,$D,$t0 - ldr $t0,[$t3,#24] - add $E,$E,$t1 - ldr $t1,[$t3,#28] - add $F,$F,$t2 - ldr $inp,[sp,#17*4] @ pull inp - ldr $t2,[sp,#18*4] @ pull inp+len - add $G,$G,$t0 - add $H,$H,$t1 - stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} - cmp $inp,$t2 - sub $Ktbl,$Ktbl,#256 @ rewind Ktbl - bne .Loop - - add sp,sp,#`16+3`*4 @ destroy frame -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r11,pc} -#else - ldmia sp!,{r4-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size sha256_blocks_arch,.-sha256_blocks_arch -___ -###################################################################### -# NEON stuff -# -{{{ -my @X=map("q$_",(0..3)); -my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); -my $Xfer=$t4; -my $j=0; - -sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } -sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } - -sub AUTOLOAD() # thunk [simplified] x86-style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; - my $arg = pop; - $arg = "#$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; -} - -sub Xupdate() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T2,$T0,$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T1,$T0,$sigma0[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vsli_32 ($T2,$T0,32-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T3,$T0,$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &veor ($T1,$T1,$T2); - eval(shift(@insns)); - eval(shift(@insns)); - &vsli_32 ($T3,$T0,32-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &veor ($T1,$T1,$T3); # sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &veor ($T5,$T5,$T4); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &veor ($T5,$T5,$T4); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &veor ($T5,$T5,$T4); - eval(shift(@insns)); - 
eval(shift(@insns)); - &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vld1_32 ("{$T0}","[$Ktbl,:128]!"); - eval(shift(@insns)); - eval(shift(@insns)); - &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &veor ($T5,$T5,$T4); # sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - &vadd_i32 ($T0,$T0,@X[0]); - while($#insns>=2) { eval(shift(@insns)); } - &vst1_32 ("{$T0}","[$Xfer,:128]!"); - eval(shift(@insns)); - eval(shift(@insns)); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub Xpreload() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vld1_32 ("{$T0}","[$Ktbl,:128]!"); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vrev32_8 (@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vadd_i32 ($T0,$T0,@X[0]); - foreach (@insns) { eval; } # remaining instructions - &vst1_32 ("{$T0}","[$Xfer,:128]!"); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub body_00_15 () { - ( - '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. - '&add ($h,$h,$t1)', # h+=X[i]+K[i] - '&eor ($t1,$f,$g)', - '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', - '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past - '&and ($t1,$t1,$e)', - '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) - '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', - '&eor ($t1,$t1,$g)', # Ch(e,f,g) - '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) - '&eor ($t2,$a,$b)', # a^b, b^c in next round - '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) - '&add ($h,$h,$t1)', # h+=Ch(e,f,g) - '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. - '&ldr ($t1,"[$Ktbl]") if ($j==15);'. - '&ldr ($t1,"[sp,#64]") if ($j==31)', - '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) - '&add ($d,$d,$h)', # d+=h - '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) - '&eor ($t3,$t3,$b)', # Maj(a,b,c) - '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' - ) -} - -$code.=<<___; -#if __ARM_MAX_ARCH__>=7 -.arch armv7-a -.fpu neon - -.global sha256_block_data_order_neon -.type sha256_block_data_order_neon,%function -.align 4 -sha256_block_data_order_neon: -.LNEON: - stmdb sp!,{r4-r12,lr} - - sub $H,sp,#16*4+16 - adr $Ktbl,.Lsha256_blocks_arch - sub $Ktbl,$Ktbl,#.Lsha256_blocks_arch-K256 - bic $H,$H,#15 @ align for 128-bit stores - mov $t2,sp - mov sp,$H @ alloca - add $len,$inp,$len,lsl#6 @ len to point at the end of inp - - vld1.8 {@X[0]},[$inp]! - vld1.8 {@X[1]},[$inp]! - vld1.8 {@X[2]},[$inp]! - vld1.8 {@X[3]},[$inp]! - vld1.32 {$T0},[$Ktbl,:128]! - vld1.32 {$T1},[$Ktbl,:128]! - vld1.32 {$T2},[$Ktbl,:128]! - vld1.32 {$T3},[$Ktbl,:128]! - vrev32.8 @X[0],@X[0] @ yes, even on - str $ctx,[sp,#64] - vrev32.8 @X[1],@X[1] @ big-endian - str $inp,[sp,#68] - mov $Xfer,sp - vrev32.8 @X[2],@X[2] - str $len,[sp,#72] - vrev32.8 @X[3],@X[3] - str $t2,[sp,#76] @ save original sp - vadd.i32 $T0,$T0,@X[0] - vadd.i32 $T1,$T1,@X[1] - vst1.32 {$T0},[$Xfer,:128]! - vadd.i32 $T2,$T2,@X[2] - vst1.32 {$T1},[$Xfer,:128]! - vadd.i32 $T3,$T3,@X[3] - vst1.32 {$T2},[$Xfer,:128]! - vst1.32 {$T3},[$Xfer,:128]! 
- - ldmia $ctx,{$A-$H} - sub $Xfer,$Xfer,#64 - ldr $t1,[sp,#0] - eor $t2,$t2,$t2 - eor $t3,$B,$C - b .L_00_48 - -.align 4 -.L_00_48: -___ - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); -$code.=<<___; - teq $t1,#0 @ check for K256 terminator - ldr $t1,[sp,#0] - sub $Xfer,$Xfer,#64 - bne .L_00_48 - - ldr $inp,[sp,#68] - ldr $t0,[sp,#72] - sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl - teq $inp,$t0 - it eq - subeq $inp,$inp,#64 @ avoid SEGV - vld1.8 {@X[0]},[$inp]! @ load next input block - vld1.8 {@X[1]},[$inp]! - vld1.8 {@X[2]},[$inp]! - vld1.8 {@X[3]},[$inp]! - it ne - strne $inp,[sp,#68] - mov $Xfer,sp -___ - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); -$code.=<<___; - ldr $t0,[$t1,#0] - add $A,$A,$t2 @ h+=Maj(a,b,c) from the past - ldr $t2,[$t1,#4] - ldr $t3,[$t1,#8] - ldr $t4,[$t1,#12] - add $A,$A,$t0 @ accumulate - ldr $t0,[$t1,#16] - add $B,$B,$t2 - ldr $t2,[$t1,#20] - add $C,$C,$t3 - ldr $t3,[$t1,#24] - add $D,$D,$t4 - ldr $t4,[$t1,#28] - add $E,$E,$t0 - str $A,[$t1],#4 - add $F,$F,$t2 - str $B,[$t1],#4 - add $G,$G,$t3 - str $C,[$t1],#4 - add $H,$H,$t4 - str $D,[$t1],#4 - stmia $t1,{$E-$H} - - ittte ne - movne $Xfer,sp - ldrne $t1,[sp,#0] - eorne $t2,$t2,$t2 - ldreq sp,[sp,#76] @ restore original sp - itt ne - eorne $t3,$B,$C - bne .L_00_48 - - ldmia sp!,{r4-r12,pc} -.size sha256_block_data_order_neon,.-sha256_block_data_order_neon -#endif -___ -}}} -###################################################################### -# ARMv8 stuff -# -{{{ -my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2)); -my @MSG=map("q$_",(8..11)); -my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); -my $Ktbl="r3"; - -$code.=<<___; -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - -# ifdef __thumb2__ -# define INST(a,b,c,d) .byte c,d|0xc,a,b -# else -# define INST(a,b,c,d) .byte a,b,c,d -# endif - -.type sha256_block_data_order_armv8,%function -.align 5 -sha256_block_data_order_armv8: -.LARMv8: - vld1.32 {$ABCD,$EFGH},[$ctx] -# ifdef __thumb2__ - adr $Ktbl,.LARMv8 - sub $Ktbl,$Ktbl,#.LARMv8-K256 -# else - adrl $Ktbl,K256 -# endif - add $len,$inp,$len,lsl#6 @ len to point at the end of inp - -.Loop_v8: - vld1.8 {@MSG[0]-@MSG[1]},[$inp]! - vld1.8 {@MSG[2]-@MSG[3]},[$inp]! - vld1.32 {$W0},[$Ktbl]! - vrev32.8 @MSG[0],@MSG[0] - vrev32.8 @MSG[1],@MSG[1] - vrev32.8 @MSG[2],@MSG[2] - vrev32.8 @MSG[3],@MSG[3] - vmov $ABCD_SAVE,$ABCD @ offload - vmov $EFGH_SAVE,$EFGH - teq $inp,$len -___ -for($i=0;$i<12;$i++) { -$code.=<<___; - vld1.32 {$W1},[$Ktbl]! - vadd.i32 $W0,$W0,@MSG[0] - sha256su0 @MSG[0],@MSG[1] - vmov $abcd,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - sha256su1 @MSG[0],@MSG[2],@MSG[3] -___ - ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); -} -$code.=<<___; - vld1.32 {$W1},[$Ktbl]! - vadd.i32 $W0,$W0,@MSG[0] - vmov $abcd,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - vld1.32 {$W0},[$Ktbl]! 
- vadd.i32 $W1,$W1,@MSG[1] - vmov $abcd,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - vld1.32 {$W1},[$Ktbl] - vadd.i32 $W0,$W0,@MSG[2] - sub $Ktbl,$Ktbl,#256-16 @ rewind - vmov $abcd,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - vadd.i32 $W1,$W1,@MSG[3] - vmov $abcd,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - vadd.i32 $ABCD,$ABCD,$ABCD_SAVE - vadd.i32 $EFGH,$EFGH,$EFGH_SAVE - it ne - bne .Loop_v8 - - vst1.32 {$ABCD,$EFGH},[$ctx] - - ret @ bx lr -.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 -#endif -___ -}}} -$code.=<<___; -.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " -.align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm OPENSSL_armcap_P,4,4 -#endif -___ - -open SELF,$0; -while() { - next if (/^#!/); - last if (!s/^#/@/ and !/^$/); - print; -} -close SELF; - -{ my %opcode = ( - "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40, - "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 ); - - sub unsha256 { - my ($mnemonic,$arg)=@_; - - if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { - my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) - |(($2&7)<<17)|(($2&8)<<4) - |(($3&7)<<1) |(($3&8)<<2); - # since ARMv7 instructions are always encoded little-endian. - # correct solution is to use .inst directive, but older - # assemblers don't implement it:-( - sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", - $word&0xff,($word>>8)&0xff, - ($word>>16)&0xff,($word>>24)&0xff, - $mnemonic,$arg; - } - } -} - -foreach (split($/,$code)) { - - s/\`([^\`]*)\`/eval $1/geo; - - s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; - - s/\bret\b/bx lr/go or - s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 - - print $_,"\n"; -} - -close STDOUT; # enforce flush diff --git a/arch/arm/lib/crypto/sha256-ce.S b/arch/arm/lib/crypto/sha256-ce.S deleted file mode 100644 index ac2c9b01b22d..000000000000 --- a/arch/arm/lib/crypto/sha256-ce.S +++ /dev/null @@ -1,123 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions - * - * Copyright (C) 2015 Linaro Ltd. - * Author: Ard Biesheuvel - */ - -#include -#include - - .text - .arch armv8-a - .fpu crypto-neon-fp-armv8 - - k0 .req q7 - k1 .req q8 - rk .req r3 - - ta0 .req q9 - ta1 .req q10 - tb0 .req q10 - tb1 .req q9 - - dga .req q11 - dgb .req q12 - - dg0 .req q13 - dg1 .req q14 - dg2 .req q15 - - .macro add_only, ev, s0 - vmov dg2, dg0 - .ifnb \s0 - vld1.32 {k\ev}, [rk, :128]! 
- .endif - sha256h.32 dg0, dg1, tb\ev - sha256h2.32 dg1, dg2, tb\ev - .ifnb \s0 - vadd.u32 ta\ev, q\s0, k\ev - .endif - .endm - - .macro add_update, ev, s0, s1, s2, s3 - sha256su0.32 q\s0, q\s1 - add_only \ev, \s1 - sha256su1.32 q\s0, q\s2, q\s3 - .endm - - .align 6 -.Lsha256_rcon: - .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 - .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 - .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 - .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 - .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc - .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da - .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 - .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 - .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 - .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 - .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 - .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 - .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 - .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 - .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 - .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 - - /* - * void sha256_ce_transform(u32 state[SHA256_STATE_WORDS], - * const u8 *data, size_t nblocks); - */ -ENTRY(sha256_ce_transform) - /* load state */ - vld1.32 {dga-dgb}, [r0] - - /* load input */ -0: vld1.32 {q0-q1}, [r1]! - vld1.32 {q2-q3}, [r1]! - subs r2, r2, #1 - -#ifndef CONFIG_CPU_BIG_ENDIAN - vrev32.8 q0, q0 - vrev32.8 q1, q1 - vrev32.8 q2, q2 - vrev32.8 q3, q3 -#endif - - /* load first round constant */ - adr rk, .Lsha256_rcon - vld1.32 {k0}, [rk, :128]! - - vadd.u32 ta0, q0, k0 - vmov dg0, dga - vmov dg1, dgb - - add_update 1, 0, 1, 2, 3 - add_update 0, 1, 2, 3, 0 - add_update 1, 2, 3, 0, 1 - add_update 0, 3, 0, 1, 2 - add_update 1, 0, 1, 2, 3 - add_update 0, 1, 2, 3, 0 - add_update 1, 2, 3, 0, 1 - add_update 0, 3, 0, 1, 2 - add_update 1, 0, 1, 2, 3 - add_update 0, 1, 2, 3, 0 - add_update 1, 2, 3, 0, 1 - add_update 0, 3, 0, 1, 2 - - add_only 1, 1 - add_only 0, 2 - add_only 1, 3 - add_only 0 - - /* update state */ - vadd.u32 dga, dga, dg0 - vadd.u32 dgb, dgb, dg1 - bne 0b - - /* store new state */ - vst1.32 {dga-dgb}, [r0] - bx lr -ENDPROC(sha256_ce_transform) diff --git a/arch/arm/lib/crypto/sha256.c b/arch/arm/lib/crypto/sha256.c deleted file mode 100644 index 109192e54b0f..000000000000 --- a/arch/arm/lib/crypto/sha256.c +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 optimized for ARM - * - * Copyright 2025 Google LLC - */ -#include -#include -#include -#include - -asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -EXPORT_SYMBOL_GPL(sha256_blocks_arch); -asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); - -void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon)) { - kernel_neon_begin(); - if (static_branch_likely(&have_ce)) - sha256_ce_transform(state, data, nblocks); - else - sha256_block_data_order_neon(state, data, nblocks); - kernel_neon_end(); - } else { - sha256_blocks_arch(state, data, nblocks); - } -} 
-EXPORT_SYMBOL_GPL(sha256_blocks_simd); - -bool sha256_is_arch_optimized(void) -{ - /* We always can use at least the ARM scalar implementation. */ - return true; -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - -static int __init sha256_arm_mod_init(void) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { - static_branch_enable(&have_neon); - if (elf_hwcap2 & HWCAP2_SHA2) - static_branch_enable(&have_ce); - } - return 0; -} -subsys_initcall(sha256_arm_mod_init); - -static void __exit sha256_arm_mod_exit(void) -{ -} -module_exit(sha256_arm_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 optimized for ARM"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index dce127a69f13..e14bef8e87af 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -190,7 +190,7 @@ config CRYPTO_LIB_SM3 if !KMSAN # avoid false positives from assembly if ARM -source "arch/arm/lib/crypto/Kconfig" +source "lib/crypto/arm/Kconfig" endif if ARM64 source "arch/arm64/lib/crypto/Kconfig" diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index aaf445a85384..5f2b81f82a85 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -106,3 +106,5 @@ obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o libsm3-y := sm3.o + +obj-$(CONFIG_ARM) += arm/ diff --git a/lib/crypto/arm/.gitignore b/lib/crypto/arm/.gitignore index 670a4d97b568..f6c4e8ef80da 100644 --- a/lib/crypto/arm/.gitignore +++ b/lib/crypto/arm/.gitignore @@ -1,2 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only +poly1305-core.S +sha256-core.S sha512-core.S diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig new file mode 100644 index 000000000000..d1ad664f0c67 --- /dev/null +++ b/lib/crypto/arm/Kconfig @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_BLAKE2S_ARM + bool "Hash functions: BLAKE2s" + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S + help + BLAKE2s cryptographic hash function (RFC 7693) + + Architecture: arm + + This is faster than the generic implementations of BLAKE2s and + BLAKE2b, but slower than the NEON implementation of BLAKE2b. + There is no NEON implementation of BLAKE2s, since NEON doesn't + really help with it. 
+ +config CRYPTO_CHACHA20_NEON + tristate + default CRYPTO_LIB_CHACHA + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_ARM + tristate + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 + +config CRYPTO_SHA256_ARM + tristate + depends on !CPU_V7M + default CRYPTO_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile new file mode 100644 index 000000000000..431f77c3ff6f --- /dev/null +++ b/lib/crypto/arm/Makefile @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o +libblake2s-arm-y := blake2s-core.o blake2s-glue.o + +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +chacha-neon-y := chacha-scalar-core.o chacha-glue.o +chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o + +obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o +poly1305-arm-y := poly1305-core.o poly1305-glue.o + +obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o +sha256-arm-y := sha256.o sha256-core.o +sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o + +quiet_cmd_perl = PERL $@ + cmd_perl = $(PERL) $(<) > $(@) + +$(obj)/%-core.S: $(src)/%-armv4.pl + $(call cmd,perl) + +clean-files += poly1305-core.S sha256-core.S + +aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 + +# massage the perlasm code a bit so we only get the NEON routine if we need it +poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 +poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 +AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) + +AFLAGS_sha256-core.o += $(aflags-thumb2-y) diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S new file mode 100644 index 000000000000..df40e46601f1 --- /dev/null +++ b/lib/crypto/arm/blake2s-core.S @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * BLAKE2s digest algorithm, ARM scalar implementation + * + * Copyright 2020 Google LLC + * + * Author: Eric Biggers + */ + +#include +#include + + // Registers used to hold message words temporarily. There aren't + // enough ARM registers to hold the whole message block, so we have to + // load the words on-demand. + M_0 .req r12 + M_1 .req r14 + +// The BLAKE2s initialization vector +.Lblake2s_IV: + .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + +.macro __ldrd a, b, src, offset +#if __LINUX_ARM_ARCH__ >= 6 + ldrd \a, \b, [\src, #\offset] +#else + ldr \a, [\src, #\offset] + ldr \b, [\src, #\offset + 4] +#endif +.endm + +.macro __strd a, b, dst, offset +#if __LINUX_ARM_ARCH__ >= 6 + strd \a, \b, [\dst, #\offset] +#else + str \a, [\dst, #\offset] + str \b, [\dst, #\offset + 4] +#endif +.endm + +.macro _le32_bswap a, tmp +#ifdef __ARMEB__ + rev_l \a, \tmp +#endif +.endm + +.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp + _le32_bswap \a, \tmp + _le32_bswap \b, \tmp + _le32_bswap \c, \tmp + _le32_bswap \d, \tmp + _le32_bswap \e, \tmp + _le32_bswap \f, \tmp + _le32_bswap \g, \tmp + _le32_bswap \h, \tmp +.endm + +// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals. +// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two +// columns/diagonals. s0-s1 are the word offsets to the message words the first +// column/diagonal needs, and likewise s2-s3 for the second column/diagonal. 
+// M_0 and M_1 are free to use, and the message block can be found at sp + 32. +// +// Note that to save instructions, the rotations don't happen when the +// pseudocode says they should, but rather they are delayed until the values are +// used. See the comment above _blake2s_round(). +.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3 + + ldr M_0, [sp, #32 + 4 * \s0] + ldr M_1, [sp, #32 + 4 * \s2] + + // a += b + m[blake2s_sigma[r][2*i + 0]]; + add \a0, \a0, \b0, ror #brot + add \a1, \a1, \b1, ror #brot + add \a0, \a0, M_0 + add \a1, \a1, M_1 + + // d = ror32(d ^ a, 16); + eor \d0, \a0, \d0, ror #drot + eor \d1, \a1, \d1, ror #drot + + // c += d; + add \c0, \c0, \d0, ror #16 + add \c1, \c1, \d1, ror #16 + + // b = ror32(b ^ c, 12); + eor \b0, \c0, \b0, ror #brot + eor \b1, \c1, \b1, ror #brot + + ldr M_0, [sp, #32 + 4 * \s1] + ldr M_1, [sp, #32 + 4 * \s3] + + // a += b + m[blake2s_sigma[r][2*i + 1]]; + add \a0, \a0, \b0, ror #12 + add \a1, \a1, \b1, ror #12 + add \a0, \a0, M_0 + add \a1, \a1, M_1 + + // d = ror32(d ^ a, 8); + eor \d0, \a0, \d0, ror#16 + eor \d1, \a1, \d1, ror#16 + + // c += d; + add \c0, \c0, \d0, ror#8 + add \c1, \c1, \d1, ror#8 + + // b = ror32(b ^ c, 7); + eor \b0, \c0, \b0, ror#12 + eor \b1, \c1, \b1, ror#12 +.endm + +// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9] +// are in r0..r9. The stack pointer points to 8 bytes of scratch space for +// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and +// r14 are free to use. The macro arguments s0-s15 give the order in which the +// message words are used in this round. +// +// All rotates are performed using the implicit rotate operand accepted by the +// 'add' and 'eor' instructions. This is faster than using explicit rotate +// instructions. To make this work, we allow the values in the second and last +// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the +// wrong rotation amount. The rotation amount is then fixed up just in time +// when the values are used. 'brot' is the number of bits the values in row 'b' +// need to be rotated right to arrive at the correct values, and 'drot' +// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such +// that they end up as (7, 8) after every round. +.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \ + s8, s9, s10, s11, s12, s13, s14, s15 + + // Mix first two columns: + // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]). + __ldrd r10, r11, sp, 16 // load v[12] and v[13] + _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \ + \s0, \s1, \s2, \s3 + __strd r8, r9, sp, 0 + __strd r10, r11, sp, 16 + + // Mix second two columns: + // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]). + __ldrd r8, r9, sp, 8 // load v[10] and v[11] + __ldrd r10, r11, sp, 24 // load v[14] and v[15] + _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \ + \s4, \s5, \s6, \s7 + str r10, [sp, #24] // store v[14] + // v[10], v[11], and v[15] are used below, so no need to store them yet. + + .set brot, 7 + .set drot, 8 + + // Mix first two diagonals: + // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]). + ldr r10, [sp, #16] // load v[12] + _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \ + \s8, \s9, \s10, \s11 + __strd r8, r9, sp, 8 + str r11, [sp, #28] + str r10, [sp, #16] + + // Mix second two diagonals: + // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]). 
+ __ldrd r8, r9, sp, 0 // load v[8] and v[9] + __ldrd r10, r11, sp, 20 // load v[13] and v[14] + _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \ + \s12, \s13, \s14, \s15 + __strd r10, r11, sp, 20 +.endm + +// +// void blake2s_compress(struct blake2s_state *state, +// const u8 *block, size_t nblocks, u32 inc); +// +// Only the first three fields of struct blake2s_state are used: +// u32 h[8]; (inout) +// u32 t[2]; (inout) +// u32 f[2]; (in) +// + .align 5 +ENTRY(blake2s_compress) + push {r0-r2,r4-r11,lr} // keep this an even number + +.Lnext_block: + // r0 is 'state' + // r1 is 'block' + // r3 is 'inc' + + // Load and increment the counter t[0..1]. + __ldrd r10, r11, r0, 32 + adds r10, r10, r3 + adc r11, r11, #0 + __strd r10, r11, r0, 32 + + // _blake2s_round is very short on registers, so copy the message block + // to the stack to save a register during the rounds. This also has the + // advantage that misalignment only needs to be dealt with in one place. + sub sp, sp, #64 + mov r12, sp + tst r1, #3 + bne .Lcopy_block_misaligned + ldmia r1!, {r2-r9} + _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 + stmia r12!, {r2-r9} + ldmia r1!, {r2-r9} + _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 + stmia r12, {r2-r9} +.Lcopy_block_done: + str r1, [sp, #68] // Update message pointer + + // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space + // for spilling v[8..9]. Leave v[8..9] in r8-r9. + mov r14, r0 // r14 = state + adr r12, .Lblake2s_IV + ldmia r12!, {r8-r9} // load IV[0..1] + __ldrd r0, r1, r14, 40 // load f[0..1] + ldm r12, {r2-r7} // load IV[3..7] + eor r4, r4, r10 // v[12] = IV[4] ^ t[0] + eor r5, r5, r11 // v[13] = IV[5] ^ t[1] + eor r6, r6, r0 // v[14] = IV[6] ^ f[0] + eor r7, r7, r1 // v[15] = IV[7] ^ f[1] + push {r2-r7} // push v[9..15] + sub sp, sp, #8 // leave space for v[8..9] + + // Load h[0..7] == v[0..7]. + ldm r14, {r0-r7} + + // Execute the rounds. Each round is provided the order in which it + // needs to use the message words. + .set brot, 0 + .set drot, 0 + _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 + _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 + _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 + _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 + _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 + _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 + _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 + _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 + _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 + + // Fold the final state matrix into the hash chaining value: + // + // for (i = 0; i < 8; i++) + // h[i] ^= v[i] ^ v[i + 8]; + // + ldr r14, [sp, #96] // r14 = &h[0] + add sp, sp, #8 // v[8..9] are already loaded. 
+ pop {r10-r11} // load v[10..11] + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + eor r3, r3, r11 + ldm r14, {r8-r11} // load h[0..3] + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + eor r3, r3, r11 + stmia r14!, {r0-r3} // store new h[0..3] + ldm r14, {r0-r3} // load old h[4..7] + pop {r8-r11} // load v[12..15] + eor r0, r0, r4, ror #brot + eor r1, r1, r5, ror #brot + eor r2, r2, r6, ror #brot + eor r3, r3, r7, ror #brot + eor r0, r0, r8, ror #drot + eor r1, r1, r9, ror #drot + eor r2, r2, r10, ror #drot + eor r3, r3, r11, ror #drot + add sp, sp, #64 // skip copy of message block + stm r14, {r0-r3} // store new h[4..7] + + // Advance to the next block, if there is one. Note that if there are + // multiple blocks, then 'inc' (the counter increment amount) must be + // 64. So we can simply set it to 64 without re-loading it. + ldm sp, {r0, r1, r2} // load (state, block, nblocks) + mov r3, #64 // set 'inc' + subs r2, r2, #1 // nblocks-- + str r2, [sp, #8] + bne .Lnext_block // nblocks != 0? + + pop {r0-r2,r4-r11,pc} + + // The next message block (pointed to by r1) isn't 4-byte aligned, so it + // can't be loaded using ldmia. Copy it to the stack buffer (pointed to + // by r12) using an alternative method. r2-r9 are free to use. +.Lcopy_block_misaligned: + mov r2, #64 +1: +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + ldr r3, [r1], #4 + _le32_bswap r3, r4 +#else + ldrb r3, [r1, #0] + ldrb r4, [r1, #1] + ldrb r5, [r1, #2] + ldrb r6, [r1, #3] + add r1, r1, #4 + orr r3, r3, r4, lsl #8 + orr r3, r3, r5, lsl #16 + orr r3, r3, r6, lsl #24 +#endif + subs r2, r2, #4 + str r3, [r12], #4 + bne 1b + b .Lcopy_block_done +ENDPROC(blake2s_compress) diff --git a/lib/crypto/arm/blake2s-glue.c b/lib/crypto/arm/blake2s-glue.c new file mode 100644 index 000000000000..0238a70d9581 --- /dev/null +++ b/lib/crypto/arm/blake2s-glue.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include + +/* defined in blake2s-core.S */ +EXPORT_SYMBOL(blake2s_compress); diff --git a/lib/crypto/arm/chacha-glue.c b/lib/crypto/arm/chacha-glue.c new file mode 100644 index 000000000000..88ec96415283 --- /dev/null +++ b/lib/crypto/arm/chacha-glue.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha and HChaCha functions (ARM optimized) + * + * Copyright (C) 2016-2019 Linaro, Ltd. 
+ * Copyright (C) 2015 Martin Willi + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, int nrounds); +asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, + int nrounds, unsigned int nbytes); +asmlinkage void hchacha_block_arm(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); +asmlinkage void hchacha_block_neon(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, + const struct chacha_state *state, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon); + +static inline bool neon_usable(void) +{ + return static_branch_likely(&use_neon) && crypto_simd_usable(); +} + +static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + u8 buf[CHACHA_BLOCK_SIZE]; + + while (bytes > CHACHA_BLOCK_SIZE) { + unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U); + + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= l; + src += l; + dst += l; + state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); + } + if (bytes) { + const u8 *s = src; + u8 *d = dst; + + if (bytes != CHACHA_BLOCK_SIZE) + s = d = memcpy(buf, src, bytes); + chacha_block_xor_neon(state, d, s, nrounds); + if (d != dst) + memcpy(dst, buf, bytes); + state->x[12]++; + } +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { + hchacha_block_arm(state, out, nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, out, nrounds); + kernel_neon_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() || + bytes <= CHACHA_BLOCK_SIZE) { + chacha_doarm(dst, src, bytes, state, nrounds); + state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE); + return; + } + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + /* We always can use at least the ARM scalar implementation. */ + return true; +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_arm_mod_init(void) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { + switch (read_cpuid_part()) { + case ARM_CPU_PART_CORTEX_A7: + case ARM_CPU_PART_CORTEX_A5: + /* + * The Cortex-A7 and Cortex-A5 do not perform well with + * the NEON implementation but do incredibly with the + * scalar one and use less power. 
+ */ + break; + default: + static_branch_enable(&use_neon); + } + } + return 0; +} +subsys_initcall(chacha_arm_mod_init); + +static void __exit chacha_arm_mod_exit(void) +{ +} +module_exit(chacha_arm_mod_exit); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm/chacha-neon-core.S b/lib/crypto/arm/chacha-neon-core.S new file mode 100644 index 000000000000..ddd62b6294a5 --- /dev/null +++ b/lib/crypto/arm/chacha-neon-core.S @@ -0,0 +1,643 @@ +/* + * ChaCha/HChaCha NEON helper functions + * + * Copyright (C) 2016 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + + /* + * NEON doesn't have a rotate instruction. The alternatives are, more or less: + * + * (a) vshl.u32 + vsri.u32 (needs temporary register) + * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register) + * (c) vrev32.16 (16-bit rotations only) + * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only, + * needs index vector) + * + * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations, + * the only choices are (a) and (b). We use (a) since it takes two-thirds the + * cycles of (b) on both Cortex-A7 and Cortex-A53. + * + * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest + * and doesn't need a temporary register. + * + * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence + * is twice as fast as (a), even when doing (a) on multiple registers + * simultaneously to eliminate the stall between vshl and vsri. Also, it + * parallelizes better when temporary registers are scarce. + * + * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as + * (a), so the need to load the rotation table actually makes the vtbl method + * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it + * seems to be a good compromise to get a more significant speed boost on some + * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7. + */ + +#include +#include + + .text + .fpu neon + .align 5 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers q0-q3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in r3. 
+ * + * Clobbers: r3, ip, q4-q5 + */ +chacha_permute: + + adr ip, .Lrol8_table + vld1.8 {d10}, [ip, :64] + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vrev32.16 q3, q3 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #12 + vsri.u32 q1, q4, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #7 + vsri.u32 q1, q4, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vext.8 q1, q1, q1, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vext.8 q2, q2, q2, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vext.8 q3, q3, q3, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vrev32.16 q3, q3 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #12 + vsri.u32 q1, q4, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #7 + vsri.u32 q1, q4, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vext.8 q1, q1, q1, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vext.8 q2, q2, q2, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vext.8 q3, q3, q3, #4 + + subs r3, r3, #2 + bne .Ldoubleround + + bx lr +ENDPROC(chacha_permute) + +ENTRY(chacha_block_xor_neon) + // r0: Input state matrix, s + // r1: 1 data block output, o + // r2: 1 data block input, i + // r3: nrounds + push {lr} + + // x0..3 = s0..3 + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + vmov q8, q0 + vmov q9, q1 + vmov q10, q2 + vmov q11, q3 + + bl chacha_permute + + add ip, r2, #0x20 + vld1.8 {q4-q5}, [r2] + vld1.8 {q6-q7}, [ip] + + // o0 = i0 ^ (x0 + s0) + vadd.i32 q0, q0, q8 + veor q0, q0, q4 + + // o1 = i1 ^ (x1 + s1) + vadd.i32 q1, q1, q9 + veor q1, q1, q5 + + // o2 = i2 ^ (x2 + s2) + vadd.i32 q2, q2, q10 + veor q2, q2, q6 + + // o3 = i3 ^ (x3 + s3) + vadd.i32 q3, q3, q11 + veor q3, q3, q7 + + add ip, r1, #0x20 + vst1.8 {q0-q1}, [r1] + vst1.8 {q2-q3}, [ip] + + pop {pc} +ENDPROC(chacha_block_xor_neon) + +ENTRY(hchacha_block_neon) + // r0: Input state matrix, s + // r1: output (8 32-bit words) + // r2: nrounds + push {lr} + + vld1.32 {q0-q1}, [r0]! + vld1.32 {q2-q3}, [r0] + + mov r3, r2 + bl chacha_permute + + vst1.32 {q0}, [r1]! + vst1.32 {q3}, [r1] + + pop {pc} +ENDPROC(hchacha_block_neon) + + .align 4 +.Lctrinc: .word 0, 1, 2, 3 +.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 + + .align 5 +ENTRY(chacha_4block_xor_neon) + push {r4, lr} + mov r4, sp // preserve the stack pointer + sub ip, sp, #0x20 // allocate a 32 byte buffer + bic ip, ip, #0x1f // aligned to 32 bytes + mov sp, ip + + // r0: Input state matrix, s + // r1: 4 data blocks output, o + // r2: 4 data blocks input, i + // r3: nrounds + + // + // This function encrypts four consecutive ChaCha blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. The words are re-interleaved before the + // final addition of the original state and the XORing step. 
+ // + + // x0..15[0-3] = s0..15[0-3] + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + adr lr, .Lctrinc + vdup.32 q15, d7[1] + vdup.32 q14, d7[0] + vld1.32 {q4}, [lr, :128] + vdup.32 q13, d6[1] + vdup.32 q12, d6[0] + vdup.32 q11, d5[1] + vdup.32 q10, d5[0] + vadd.u32 q12, q12, q4 // x12 += counter values 0-3 + vdup.32 q9, d4[1] + vdup.32 q8, d4[0] + vdup.32 q7, d3[1] + vdup.32 q6, d3[0] + vdup.32 q5, d2[1] + vdup.32 q4, d2[0] + vdup.32 q3, d1[1] + vdup.32 q2, d1[0] + vdup.32 q1, d0[1] + vdup.32 q0, d0[0] + + adr ip, .Lrol8_table + b 1f + +.Ldoubleround4: + vld1.32 {q8-q9}, [sp, :256] +1: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vadd.i32 q0, q0, q4 + vadd.i32 q1, q1, q5 + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 + + vrev32.16 q12, q12 + vrev32.16 q13, q13 + vrev32.16 q14, q14 + vrev32.16 q15, q15 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i32 q10, q10, q14 + vadd.i32 q11, q11, q15 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q4, q8 + veor q9, q5, q9 + vshl.u32 q4, q8, #12 + vshl.u32 q5, q9, #12 + vsri.u32 q4, q8, #20 + vsri.u32 q5, q9, #20 + + veor q8, q6, q10 + veor q9, q7, q11 + vshl.u32 q6, q8, #12 + vshl.u32 q7, q9, #12 + vsri.u32 q6, q8, #20 + vsri.u32 q7, q9, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vld1.8 {d16}, [ip, :64] + vadd.i32 q0, q0, q4 + vadd.i32 q1, q1, q5 + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 + + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + + vld1.32 {q8-q9}, [sp, :256] + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i32 q10, q10, q14 + vadd.i32 q11, q11, q15 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q4, q8 + veor q9, q5, q9 + vshl.u32 q4, q8, #7 + vshl.u32 q5, q9, #7 + vsri.u32 q4, q8, #25 + vsri.u32 q5, q9, #25 + + veor q8, q6, q10 + veor q9, q7, q11 + vshl.u32 q6, q8, #7 + vshl.u32 q7, q9, #7 + vsri.u32 q6, q8, #25 + vsri.u32 q7, q9, #25 + + vld1.32 {q8-q9}, [sp, :256] + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vadd.i32 q0, q0, q5 + vadd.i32 q1, q1, q6 + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q4 + + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 + + vrev32.16 q15, q15 + vrev32.16 q12, q12 + vrev32.16 q13, q13 + vrev32.16 q14, q14 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vadd.i32 q10, q10, q15 + vadd.i32 q11, q11, q12 + vadd.i32 q8, q8, q13 + vadd.i32 q9, q9, q14 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q7, q8 + veor q9, q4, q9 + vshl.u32 
q7, q8, #12 + vshl.u32 q4, q9, #12 + vsri.u32 q7, q8, #20 + vsri.u32 q4, q9, #20 + + veor q8, q5, q10 + veor q9, q6, q11 + vshl.u32 q5, q8, #12 + vshl.u32 q6, q9, #12 + vsri.u32 q5, q8, #20 + vsri.u32 q6, q9, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vld1.8 {d16}, [ip, :64] + vadd.i32 q0, q0, q5 + vadd.i32 q1, q1, q6 + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q4 + + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 + + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + + vld1.32 {q8-q9}, [sp, :256] + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vadd.i32 q10, q10, q15 + vadd.i32 q11, q11, q12 + vadd.i32 q8, q8, q13 + vadd.i32 q9, q9, q14 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q7, q8 + veor q9, q4, q9 + vshl.u32 q7, q8, #7 + vshl.u32 q4, q9, #7 + vsri.u32 q7, q8, #25 + vsri.u32 q4, q9, #25 + + veor q8, q5, q10 + veor q9, q6, q11 + vshl.u32 q5, q8, #7 + vshl.u32 q6, q9, #7 + vsri.u32 q5, q8, #25 + vsri.u32 q6, q9, #25 + + subs r3, r3, #2 + bne .Ldoubleround4 + + // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15. + // x8..9[0-3] are on the stack. + + // Re-interleave the words in the first two rows of each block (x0..7). + // Also add the counter values 0-3 to x12[0-3]. + vld1.32 {q8}, [lr, :128] // load counter values 0-3 + vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) + vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) + vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) + vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7) + vadd.u32 q12, q8 // x12 += counter values 0-3 + vswp d1, d4 + vswp d3, d6 + vld1.32 {q8-q9}, [r0]! // load s0..7 + vswp d9, d12 + vswp d11, d14 + + // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1) + // after XORing the first 32 bytes. + vswp q1, q4 + + // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7) + + // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block) + vadd.u32 q0, q0, q8 + vadd.u32 q2, q2, q8 + vadd.u32 q4, q4, q8 + vadd.u32 q3, q3, q8 + + // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block) + vadd.u32 q1, q1, q9 + vadd.u32 q6, q6, q9 + vadd.u32 q5, q5, q9 + vadd.u32 q7, q7, q9 + + // XOR first 32 bytes using keystream from first two rows of first block + vld1.8 {q8-q9}, [r2]! + veor q8, q8, q0 + veor q9, q9, q1 + vst1.8 {q8-q9}, [r1]! + + // Re-interleave the words in the last two rows of each block (x8..15). 
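
The vzip/vswp sequence that follows (like the one above for x0..7) undoes the word-sliced layout: during the rounds each NEON register holds the same word of all four blocks, so before the final state addition and XOR the words have to be regrouped per block, which amounts to a 4x4 word transpose done four words at a time. A scalar sketch of the equivalent data movement; deinterleave4x4 is a hypothetical helper shown only to make the lane shuffles easier to follow:

    #include <stdint.h>

    /* in[w][b] = word w of block b  ->  out[b][w] = word w of block b */
    static void deinterleave4x4(uint32_t out[4][4], const uint32_t in[4][4])
    {
            for (int w = 0; w < 4; w++)
                    for (int b = 0; b < 4; b++)
                            out[b][w] = in[w][b];
    }
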
+ vld1.32 {q8-q9}, [sp, :256] + mov sp, r4 // restore original stack pointer + ldr r4, [r4, #8] // load number of bytes + vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) + vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) + vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) + vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11) + vld1.32 {q0-q1}, [r0] // load s8..15 + vswp d25, d28 + vswp d27, d30 + vswp d17, d20 + vswp d19, d22 + + // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15) + + // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block) + vadd.u32 q8, q8, q0 + vadd.u32 q10, q10, q0 + vadd.u32 q9, q9, q0 + vadd.u32 q11, q11, q0 + + // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block) + vadd.u32 q12, q12, q1 + vadd.u32 q14, q14, q1 + vadd.u32 q13, q13, q1 + vadd.u32 q15, q15, q1 + + // XOR the rest of the data with the keystream + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #96 + veor q0, q0, q8 + veor q1, q1, q12 + ble .Lle96 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q2 + veor q1, q1, q6 + ble .Lle128 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q10 + veor q1, q1, q14 + ble .Lle160 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q4 + veor q1, q1, q5 + ble .Lle192 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q9 + veor q1, q1, q13 + ble .Lle224 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q3 + veor q1, q1, q7 + blt .Llt256 +.Lout: + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2] + veor q0, q0, q11 + veor q1, q1, q15 + vst1.8 {q0-q1}, [r1] + + pop {r4, pc} + +.Lle192: + vmov q4, q9 + vmov q5, q13 + +.Lle160: + // nothing to do + +.Lfinalblock: + // Process the final block if processing less than 4 full blocks. + // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the + // previous 32 byte output block that still needs to be written at + // [r1] in q0-q1. + beq .Lfullblock + +.Lpartialblock: + adr lr, .Lpermute + 32 + add r2, r2, r4 + add lr, lr, r4 + add r4, r4, r1 + + vld1.8 {q2-q3}, [lr] + vld1.8 {q6-q7}, [r2] + + add r4, r4, #32 + + vtbl.8 d4, {q4-q5}, d4 + vtbl.8 d5, {q4-q5}, d5 + vtbl.8 d6, {q4-q5}, d6 + vtbl.8 d7, {q4-q5}, d7 + + veor q6, q6, q2 + veor q7, q7, q3 + + vst1.8 {q6-q7}, [r4] // overlapping stores + vst1.8 {q0-q1}, [r1] + pop {r4, pc} + +.Lfullblock: + vmov q11, q4 + vmov q15, q5 + b .Lout +.Lle96: + vmov q4, q2 + vmov q5, q6 + b .Lfinalblock +.Lle128: + vmov q4, q10 + vmov q5, q14 + b .Lfinalblock +.Lle224: + vmov q4, q3 + vmov q5, q7 + b .Lfinalblock +.Llt256: + vmov q4, q11 + vmov q5, q15 + b .Lpartialblock +ENDPROC(chacha_4block_xor_neon) + + .align L1_CACHE_SHIFT +.Lpermute: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f diff --git a/lib/crypto/arm/chacha-scalar-core.S b/lib/crypto/arm/chacha-scalar-core.S new file mode 100644 index 000000000000..4951df05c158 --- /dev/null +++ b/lib/crypto/arm/chacha-scalar-core.S @@ -0,0 +1,444 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Google, Inc. 
+ */ + +#include +#include + +/* + * Design notes: + * + * 16 registers would be needed to hold the state matrix, but only 14 are + * available because 'sp' and 'pc' cannot be used. So we spill the elements + * (x8, x9) to the stack and swap them out with (x10, x11). This adds one + * 'ldrd' and one 'strd' instruction per round. + * + * All rotates are performed using the implicit rotate operand accepted by the + * 'add' and 'eor' instructions. This is faster than using explicit rotate + * instructions. To make this work, we allow the values in the second and last + * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the + * wrong rotation amount. The rotation amount is then fixed up just in time + * when the values are used. 'brot' is the number of bits the values in row 'b' + * need to be rotated right to arrive at the correct values, and 'drot' + * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such + * that they end up as (25, 24) after every round. + */ + + // ChaCha state registers + X0 .req r0 + X1 .req r1 + X2 .req r2 + X3 .req r3 + X4 .req r4 + X5 .req r5 + X6 .req r6 + X7 .req r7 + X8_X10 .req r8 // shared by x8 and x10 + X9_X11 .req r9 // shared by x9 and x11 + X12 .req r10 + X13 .req r11 + X14 .req r12 + X15 .req r14 + +.macro _le32_bswap_4x a, b, c, d, tmp +#ifdef __ARMEB__ + rev_l \a, \tmp + rev_l \b, \tmp + rev_l \c, \tmp + rev_l \d, \tmp +#endif +.endm + +.macro __ldrd a, b, src, offset +#if __LINUX_ARM_ARCH__ >= 6 + ldrd \a, \b, [\src, #\offset] +#else + ldr \a, [\src, #\offset] + ldr \b, [\src, #\offset + 4] +#endif +.endm + +.macro __strd a, b, dst, offset +#if __LINUX_ARM_ARCH__ >= 6 + strd \a, \b, [\dst, #\offset] +#else + str \a, [\dst, #\offset] + str \b, [\dst, #\offset + 4] +#endif +.endm + +.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2 + + // a += b; d ^= a; d = rol(d, 16); + add \a1, \a1, \b1, ror #brot + add \a2, \a2, \b2, ror #brot + eor \d1, \a1, \d1, ror #drot + eor \d2, \a2, \d2, ror #drot + // drot == 32 - 16 == 16 + + // c += d; b ^= c; b = rol(b, 12); + add \c1, \c1, \d1, ror #16 + add \c2, \c2, \d2, ror #16 + eor \b1, \c1, \b1, ror #brot + eor \b2, \c2, \b2, ror #brot + // brot == 32 - 12 == 20 + + // a += b; d ^= a; d = rol(d, 8); + add \a1, \a1, \b1, ror #20 + add \a2, \a2, \b2, ror #20 + eor \d1, \a1, \d1, ror #16 + eor \d2, \a2, \d2, ror #16 + // drot == 32 - 8 == 24 + + // c += d; b ^= c; b = rol(b, 7); + add \c1, \c1, \d1, ror #24 + add \c2, \c2, \d2, ror #24 + eor \b1, \c1, \b1, ror #20 + eor \b2, \c2, \b2, ror #20 + // brot == 32 - 7 == 25 +.endm + +.macro _doubleround + + // column round + + // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13) + _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13 + + // save (x8, x9); restore (x10, x11) + __strd X8_X10, X9_X11, sp, 0 + __ldrd X8_X10, X9_X11, sp, 8 + + // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15) + _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15 + + .set brot, 25 + .set drot, 24 + + // diagonal round + + // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12) + _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12 + + // save (x10, x11); restore (x8, x9) + __strd X8_X10, X9_X11, sp, 8 + __ldrd X8_X10, X9_X11, sp, 0 + + // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14) + _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14 +.endm + +.macro _chacha_permute nrounds + .set brot, 0 + .set drot, 0 + .rept \nrounds / 2 + _doubleround + .endr +.endm + +.macro _chacha nrounds + +.Lnext_block\@: + // Stack: unused0-unused1 
x10-x11 x0-x15 OUT IN LEN + // Registers contain x0-x9,x12-x15. + + // Do the core ChaCha permutation to update x0-x15. + _chacha_permute \nrounds + + add sp, #8 + // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers contain x0-x9,x12-x15. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15). + push {X8_X10, X9_X11, X12, X13, X14, X15} + + // Load (OUT, IN, LEN). + ldr r14, [sp, #96] + ldr r12, [sp, #100] + ldr r11, [sp, #104] + + orr r10, r14, r12 + + // Use slow path if fewer than 64 bytes remain. + cmp r11, #64 + blt .Lxor_slowpath\@ + + // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on + // ARMv6+, since ldmia and stmia (used below) still require alignment. + tst r10, #3 + bne .Lxor_slowpath\@ + + // Fast path: XOR 64 bytes of aligned data. + + // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // x0-x3 + __ldrd r8, r9, sp, 32 + __ldrd r10, r11, sp, 40 + add X0, X0, r8 + add X1, X1, r9 + add X2, X2, r10 + add X3, X3, r11 + _le32_bswap_4x X0, X1, X2, X3, r8 + ldmia r12!, {r8-r11} + eor X0, X0, r8 + eor X1, X1, r9 + eor X2, X2, r10 + eor X3, X3, r11 + stmia r14!, {X0-X3} + + // x4-x7 + __ldrd r8, r9, sp, 48 + __ldrd r10, r11, sp, 56 + add X4, r8, X4, ror #brot + add X5, r9, X5, ror #brot + ldmia r12!, {X0-X3} + add X6, r10, X6, ror #brot + add X7, r11, X7, ror #brot + _le32_bswap_4x X4, X5, X6, X7, r8 + eor X4, X4, X0 + eor X5, X5, X1 + eor X6, X6, X2 + eor X7, X7, X3 + stmia r14!, {X4-X7} + + // x8-x15 + pop {r0-r7} // (x8-x9,x12-x15,x10-x11) + __ldrd r8, r9, sp, 32 + __ldrd r10, r11, sp, 40 + add r0, r0, r8 // x8 + add r1, r1, r9 // x9 + add r6, r6, r10 // x10 + add r7, r7, r11 // x11 + _le32_bswap_4x r0, r1, r6, r7, r8 + ldmia r12!, {r8-r11} + eor r0, r0, r8 // x8 + eor r1, r1, r9 // x9 + eor r6, r6, r10 // x10 + eor r7, r7, r11 // x11 + stmia r14!, {r0,r1,r6,r7} + ldmia r12!, {r0,r1,r6,r7} + __ldrd r8, r9, sp, 48 + __ldrd r10, r11, sp, 56 + add r2, r8, r2, ror #drot // x12 + add r3, r9, r3, ror #drot // x13 + add r4, r10, r4, ror #drot // x14 + add r5, r11, r5, ror #drot // x15 + _le32_bswap_4x r2, r3, r4, r5, r9 + ldr r9, [sp, #72] // load LEN + eor r2, r2, r0 // x12 + eor r3, r3, r1 // x13 + eor r4, r4, r6 // x14 + eor r5, r5, r7 // x15 + subs r9, #64 // decrement and check LEN + stmia r14!, {r2-r5} + + beq .Ldone\@ + +.Lprepare_for_next_block\@: + + // Stack: x0-x15 OUT IN LEN + + // Increment block counter (x12) + add r8, #1 + + // Store updated (OUT, IN, LEN) + str r14, [sp, #64] + str r12, [sp, #68] + str r9, [sp, #72] + + mov r14, sp + + // Store updated block counter (x12) + str r8, [sp, #48] + + sub sp, #16 + + // Reload state and do next block + ldmia r14!, {r0-r11} // load x0-x11 + __strd r10, r11, sp, 8 // store x10-x11 before state + ldmia r14, {r10-r12,r14} // load x12-x15 + b .Lnext_block\@ + +.Lxor_slowpath\@: + // Slow path: < 64 bytes remaining, or unaligned input or output buffer. + // We handle it by storing the 64 bytes of keystream to the stack, then + // XOR-ing the needed portion with the data. + + // Allocate keystream buffer + sub sp, #64 + mov r14, sp + + // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. 
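
The "ror #brot" and "ror #drot" operands used in the keystream code below rely on the design note at the top of this file: rows b and d are left in an under-rotated form, and the outstanding left rotation by r bits is applied for free as a right rotation by 32 - r in the shifted operand of the next add/eor that consumes the value. A small C check of the identity being exploited (rol32_ref, ror32_ref and check_deferred_rotation are illustrative names, not kernel code):

    #include <assert.h>
    #include <stdint.h>

    static inline uint32_t rol32_ref(uint32_t x, int r)
    {
            return (x << r) | (x >> (32 - r));
    }

    static inline uint32_t ror32_ref(uint32_t x, int r)
    {
            return (x >> r) | (x << (32 - r));
    }

    /* rol by r == ror by 32 - r, hence (brot, drot) end each round as (25, 24). */
    static void check_deferred_rotation(uint32_t v)
    {
            assert(rol32_ref(v, 16) == ror32_ref(v, 32 - 16));  /* drot = 16 */
            assert(rol32_ref(v, 12) == ror32_ref(v, 32 - 12));  /* brot = 20 */
            assert(rol32_ref(v, 8)  == ror32_ref(v, 32 - 8));   /* drot = 24 */
            assert(rol32_ref(v, 7)  == ror32_ref(v, 32 - 7));   /* brot = 25 */
    }
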
+ + // Save keystream for x0-x3 + __ldrd r8, r9, sp, 96 + __ldrd r10, r11, sp, 104 + add X0, X0, r8 + add X1, X1, r9 + add X2, X2, r10 + add X3, X3, r11 + _le32_bswap_4x X0, X1, X2, X3, r8 + stmia r14!, {X0-X3} + + // Save keystream for x4-x7 + __ldrd r8, r9, sp, 112 + __ldrd r10, r11, sp, 120 + add X4, r8, X4, ror #brot + add X5, r9, X5, ror #brot + add X6, r10, X6, ror #brot + add X7, r11, X7, ror #brot + _le32_bswap_4x X4, X5, X6, X7, r8 + add r8, sp, #64 + stmia r14!, {X4-X7} + + // Save keystream for x8-x15 + ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11) + __ldrd r8, r9, sp, 128 + __ldrd r10, r11, sp, 136 + add r0, r0, r8 // x8 + add r1, r1, r9 // x9 + add r6, r6, r10 // x10 + add r7, r7, r11 // x11 + _le32_bswap_4x r0, r1, r6, r7, r8 + stmia r14!, {r0,r1,r6,r7} + __ldrd r8, r9, sp, 144 + __ldrd r10, r11, sp, 152 + add r2, r8, r2, ror #drot // x12 + add r3, r9, r3, ror #drot // x13 + add r4, r10, r4, ror #drot // x14 + add r5, r11, r5, ror #drot // x15 + _le32_bswap_4x r2, r3, r4, r5, r9 + stmia r14, {r2-r5} + + // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN + // Registers: r8 is block counter, r12 is IN. + + ldr r9, [sp, #168] // LEN + ldr r14, [sp, #160] // OUT + cmp r9, #64 + mov r0, sp + movle r1, r9 + movgt r1, #64 + // r1 is number of bytes to XOR, in range [1, 64] + +.if __LINUX_ARM_ARCH__ < 6 + orr r2, r12, r14 + tst r2, #3 // IN or OUT misaligned? + bne .Lxor_next_byte\@ +.endif + + // XOR a word at a time +.rept 16 + subs r1, #4 + blt .Lxor_words_done\@ + ldr r2, [r12], #4 + ldr r3, [r0], #4 + eor r2, r2, r3 + str r2, [r14], #4 +.endr + b .Lxor_slowpath_done\@ +.Lxor_words_done\@: + ands r1, r1, #3 + beq .Lxor_slowpath_done\@ + + // XOR a byte at a time +.Lxor_next_byte\@: + ldrb r2, [r12], #1 + ldrb r3, [r0], #1 + eor r2, r2, r3 + strb r2, [r14], #1 + subs r1, #1 + bne .Lxor_next_byte\@ + +.Lxor_slowpath_done\@: + subs r9, #64 + add sp, #96 + bgt .Lprepare_for_next_block\@ + +.Ldone\@: +.endm // _chacha + +/* + * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, + * const struct chacha_state *state, int nrounds); + */ +ENTRY(chacha_doarm) + cmp r2, #0 // len == 0? + reteq lr + + ldr ip, [sp] + cmp ip, #12 + + push {r0-r2,r4-r11,lr} + + // Push state x0-x15 onto stack. + // Also store an extra copy of x10-x11 just before the state. + + add X12, r3, #48 + ldm X12, {X12,X13,X14,X15} + push {X12,X13,X14,X15} + sub sp, sp, #64 + + __ldrd X8_X10, X9_X11, r3, 40 + __strd X8_X10, X9_X11, sp, 8 + __strd X8_X10, X9_X11, sp, 56 + ldm r3, {X0-X9_X11} + __strd X0, X1, sp, 16 + __strd X2, X3, sp, 24 + __strd X4, X5, sp, 32 + __strd X6, X7, sp, 40 + __strd X8_X10, X9_X11, sp, 48 + + beq 1f + _chacha 20 + +0: add sp, #76 + pop {r4-r11, pc} + +1: _chacha 12 + b 0b +ENDPROC(chacha_doarm) + +/* + * void hchacha_block_arm(const struct chacha_state *state, + * u32 out[HCHACHA_OUT_WORDS], int nrounds); + */ +ENTRY(hchacha_block_arm) + push {r1,r4-r11,lr} + + cmp r2, #12 // ChaCha12 ? 
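
hchacha_block_arm, continued below, runs the same permutation as the block functions but skips the feed-forward addition of the original state and stores only words 0..3 and 12..15 of the permuted state; that is the HChaCha construction used to derive XChaCha subkeys. A rough C model of what the routine computes, reusing double_round() from the earlier sketch (hchacha_ref is an illustrative name, not the kernel API):

    #include <stdint.h>
    #include <string.h>

    /* Needs double_round() from the earlier ChaCha sketch. */
    static void hchacha_ref(const uint32_t state[16], uint32_t out[8], int nrounds)
    {
            uint32_t x[16];

            memcpy(x, state, sizeof(x));
            for (int i = 0; i < nrounds; i += 2)
                    double_round(x);
            memcpy(&out[0], &x[0],  4 * sizeof(uint32_t));  /* words 0..3   */
            memcpy(&out[4], &x[12], 4 * sizeof(uint32_t));  /* words 12..15 */
    }
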
+ + mov r14, r0 + ldmia r14!, {r0-r11} // load x0-x11 + push {r10-r11} // store x10-x11 to stack + ldm r14, {r10-r12,r14} // load x12-x15 + sub sp, #8 + + beq 1f + _chacha_permute 20 + + // Skip over (unused0-unused1, x10-x11) +0: add sp, #16 + + // Fix up rotations of x12-x15 + ror X12, X12, #drot + ror X13, X13, #drot + pop {r4} // load 'out' + ror X14, X14, #drot + ror X15, X15, #drot + + // Store (x0-x3,x12-x15) to 'out' + stm r4, {X0,X1,X2,X3,X12,X13,X14,X15} + + pop {r4-r11,pc} + +1: _chacha_permute 12 + b 0b +ENDPROC(hchacha_block_arm) diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl new file mode 100644 index 000000000000..d57c6e2fc84a --- /dev/null +++ b/lib/crypto/arm/poly1305-armv4.pl @@ -0,0 +1,1236 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# IALU(*)/gcc-4.4 NEON +# +# ARM11xx(ARMv6) 7.78/+100% - +# Cortex-A5 6.35/+130% 3.00 +# Cortex-A8 6.25/+115% 2.36 +# Cortex-A9 5.10/+95% 2.55 +# Cortex-A15 3.85/+85% 1.25(**) +# Snapdragon S4 5.70/+100% 1.48(**) +# +# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; +# (**) these are trade-off results, they can be improved by ~8% but at +# the cost of 15/12% regression on Cortex-A5/A7, it's even possible +# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arm +# define poly1305_emit poly1305_emit_arch +.globl poly1305_blocks_neon +#endif + +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.text + +.globl poly1305_emit +.globl poly1305_blocks +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: +.Lpoly1305_init: + stmdb sp!,{r4-r11} + + eor r3,r3,r3 + cmp $inp,#0 + str r3,[$ctx,#0] @ zero hash value + str r3,[$ctx,#4] + str r3,[$ctx,#8] + str r3,[$ctx,#12] + str r3,[$ctx,#16] + str r3,[$ctx,#36] @ clear is_base2_26 + add $ctx,$ctx,#20 + +#ifdef __thumb2__ + it eq +#endif + moveq r0,#0 + beq .Lno_key + +#if __ARM_MAX_ARCH__>=7 + mov r3,#-1 + str r3,[$ctx,#28] @ impossible key power value +# ifndef __KERNEL__ + adr r11,.Lpoly1305_init + ldr r12,.LOPENSSL_armcap +# endif +#endif + ldrb r4,[$inp,#0] + mov r10,#0x0fffffff + ldrb r5,[$inp,#1] + and r3,r10,#-4 @ 0x0ffffffc + ldrb r6,[$inp,#2] + ldrb r7,[$inp,#3] + orr r4,r4,r5,lsl#8 + ldrb r5,[$inp,#4] + orr r4,r4,r6,lsl#16 + ldrb r6,[$inp,#5] + orr r4,r4,r7,lsl#24 + ldrb r7,[$inp,#6] + and r4,r4,r10 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +# if !defined(_WIN32) + ldr r12,[r11,r12] @ OPENSSL_armcap_P +# endif +# if defined(__APPLE__) || 
defined(_WIN32) + ldr r12,[r12] +# endif +#endif + ldrb r8,[$inp,#7] + orr r5,r5,r6,lsl#8 + ldrb r6,[$inp,#8] + orr r5,r5,r7,lsl#16 + ldrb r7,[$inp,#9] + orr r5,r5,r8,lsl#24 + ldrb r8,[$inp,#10] + and r5,r5,r3 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + tst r12,#ARMV7_NEON @ check for NEON +# ifdef __thumb2__ + adr r9,.Lpoly1305_blocks_neon + adr r11,.Lpoly1305_blocks + it ne + movne r11,r9 + adr r12,.Lpoly1305_emit + orr r11,r11,#1 @ thumb-ify addresses + orr r12,r12,#1 +# else + add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) + ite eq + addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) + addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) +# endif +#endif + ldrb r9,[$inp,#11] + orr r6,r6,r7,lsl#8 + ldrb r7,[$inp,#12] + orr r6,r6,r8,lsl#16 + ldrb r8,[$inp,#13] + orr r6,r6,r9,lsl#24 + ldrb r9,[$inp,#14] + and r6,r6,r3 + + ldrb r10,[$inp,#15] + orr r7,r7,r8,lsl#8 + str r4,[$ctx,#0] + orr r7,r7,r9,lsl#16 + str r5,[$ctx,#4] + orr r7,r7,r10,lsl#24 + str r6,[$ctx,#8] + and r7,r7,r3 + str r7,[$ctx,#12] +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + stmia r2,{r11,r12} @ fill functions table + mov r0,#1 +#else + mov r0,#0 +#endif +.Lno_key: + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_init,.-poly1305_init +___ +{ +my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); +my ($s1,$s2,$s3)=($r1,$r2,$r3); + +$code.=<<___; +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + stmdb sp!,{r3-r11,lr} + + ands $len,$len,#-16 + beq .Lno_data + + add $len,$len,$inp @ end pointer + sub sp,sp,#32 + +#if __ARM_ARCH__<7 + ldmia $ctx,{$h0-$r3} @ load context + add $ctx,$ctx,#20 + str $len,[sp,#16] @ offload stuff + str $ctx,[sp,#12] +#else + ldr lr,[$ctx,#36] @ is_base2_26 + ldmia $ctx!,{$h0-$h4} @ load hash value + str $len,[sp,#16] @ offload stuff + str $ctx,[sp,#12] + + adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $r1,$h1,lsr#6 + adcs $r1,$r1,$h2,lsl#20 + mov $r2,$h2,lsr#12 + adcs $r2,$r2,$h3,lsl#14 + mov $r3,$h3,lsr#18 + adcs $r3,$r3,$h4,lsl#8 + mov $len,#0 + teq lr,#0 + str $len,[$ctx,#16] @ clear is_base2_26 + adc $len,$len,$h4,lsr#24 + + itttt ne + movne $h0,$r0 @ choose between radixes + movne $h1,$r1 + movne $h2,$r2 + movne $h3,$r3 + ldmia $ctx,{$r0-$r3} @ load key + it ne + movne $h4,$len +#endif + + mov lr,$inp + cmp $padbit,#0 + str $r1,[sp,#20] + str $r2,[sp,#24] + str $r3,[sp,#28] + b .Loop + +.align 4 +.Loop: +#if __ARM_ARCH__<7 + ldrb r0,[lr],#16 @ load input +# ifdef __thumb2__ + it hi +# endif + addhi $h4,$h4,#1 @ 1<<128 + ldrb r1,[lr,#-15] + ldrb r2,[lr,#-14] + ldrb r3,[lr,#-13] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-12] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-11] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-10] + adds $h0,$h0,r3 @ accumulate input + + ldrb r3,[lr,#-9] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-8] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-7] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-6] + adcs $h1,$h1,r3 + + ldrb r3,[lr,#-5] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-4] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-3] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-2] + adcs $h2,$h2,r3 + + ldrb r3,[lr,#-1] + orr r1,r0,r1,lsl#8 + str lr,[sp,#8] @ offload input pointer + orr r2,r1,r2,lsl#16 + add $s1,$r1,$r1,lsr#2 + orr r3,r2,r3,lsl#24 +#else + ldr r0,[lr],#16 @ load input + it hi + addhi $h4,$h4,#1 @ padbit + ldr r1,[lr,#-12] + ldr r2,[lr,#-8] + ldr r3,[lr,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev 
r3,r3 +# endif + adds $h0,$h0,r0 @ accumulate input + str lr,[sp,#8] @ offload input pointer + adcs $h1,$h1,r1 + add $s1,$r1,$r1,lsr#2 + adcs $h2,$h2,r2 +#endif + add $s2,$r2,$r2,lsr#2 + adcs $h3,$h3,r3 + add $s3,$r3,$r3,lsr#2 + + umull r2,r3,$h1,$r0 + adc $h4,$h4,#0 + umull r0,r1,$h0,$r0 + umlal r2,r3,$h4,$s1 + umlal r0,r1,$h3,$s1 + ldr $r1,[sp,#20] @ reload $r1 + umlal r2,r3,$h2,$s3 + umlal r0,r1,$h1,$s3 + umlal r2,r3,$h3,$s2 + umlal r0,r1,$h2,$s2 + umlal r2,r3,$h0,$r1 + str r0,[sp,#0] @ future $h0 + mul r0,$s2,$h4 + ldr $r2,[sp,#24] @ reload $r2 + adds r2,r2,r1 @ d1+=d0>>32 + eor r1,r1,r1 + adc lr,r3,#0 @ future $h2 + str r2,[sp,#4] @ future $h1 + + mul r2,$s3,$h4 + eor r3,r3,r3 + umlal r0,r1,$h3,$s3 + ldr $r3,[sp,#28] @ reload $r3 + umlal r2,r3,$h3,$r0 + umlal r0,r1,$h2,$r0 + umlal r2,r3,$h2,$r1 + umlal r0,r1,$h1,$r1 + umlal r2,r3,$h1,$r2 + umlal r0,r1,$h0,$r2 + umlal r2,r3,$h0,$r3 + ldr $h0,[sp,#0] + mul $h4,$r0,$h4 + ldr $h1,[sp,#4] + + adds $h2,lr,r0 @ d2+=d1>>32 + ldr lr,[sp,#8] @ reload input pointer + adc r1,r1,#0 + adds $h3,r2,r1 @ d3+=d2>>32 + ldr r0,[sp,#16] @ reload end pointer + adc r3,r3,#0 + add $h4,$h4,r3 @ h4+=d3>>32 + + and r1,$h4,#-4 + and $h4,$h4,#3 + add r1,r1,r1,lsr#2 @ *=5 + adds $h0,$h0,r1 + adcs $h1,$h1,#0 + adcs $h2,$h2,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 + + cmp r0,lr @ done yet? + bhi .Loop + + ldr $ctx,[sp,#12] + add sp,sp,#32 + stmdb $ctx,{$h0-$h4} @ store the result + +.Lno_data: +#if __ARM_ARCH__>=5 + ldmia sp!,{r3-r11,pc} +#else + ldmia sp!,{r3-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce)=map("r$_",(0..2)); +my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); +my $g4=$ctx; + +$code.=<<___; +.type poly1305_emit,%function +.align 5 +poly1305_emit: +.Lpoly1305_emit: + stmdb sp!,{r4-r11} + + ldmia $ctx,{$h0-$h4} + +#if __ARM_ARCH__>=7 + ldr ip,[$ctx,#36] @ is_base2_26 + + adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $g1,$h1,lsr#6 + adcs $g1,$g1,$h2,lsl#20 + mov $g2,$h2,lsr#12 + adcs $g2,$g2,$h3,lsl#14 + mov $g3,$h3,lsr#18 + adcs $g3,$g3,$h4,lsl#8 + mov $g4,#0 + adc $g4,$g4,$h4,lsr#24 + + tst ip,ip + itttt ne + movne $h0,$g0 + movne $h1,$g1 + movne $h2,$g2 + movne $h3,$g3 + it ne + movne $h4,$g4 +#endif + + adds $g0,$h0,#5 @ compare to modulus + adcs $g1,$h1,#0 + adcs $g2,$h2,#0 + adcs $g3,$h3,#0 + adc $g4,$h4,#0 + tst $g4,#4 @ did it carry/borrow? 
+ +#ifdef __thumb2__ + it ne +#endif + movne $h0,$g0 + ldr $g0,[$nonce,#0] +#ifdef __thumb2__ + it ne +#endif + movne $h1,$g1 + ldr $g1,[$nonce,#4] +#ifdef __thumb2__ + it ne +#endif + movne $h2,$g2 + ldr $g2,[$nonce,#8] +#ifdef __thumb2__ + it ne +#endif + movne $h3,$g3 + ldr $g3,[$nonce,#12] + + adds $h0,$h0,$g0 + adcs $h1,$h1,$g1 + adcs $h2,$h2,$g2 + adc $h3,$h3,$g3 + +#if __ARM_ARCH__>=7 +# ifdef __ARMEB__ + rev $h0,$h0 + rev $h1,$h1 + rev $h2,$h2 + rev $h3,$h3 +# endif + str $h0,[$mac,#0] + str $h1,[$mac,#4] + str $h2,[$mac,#8] + str $h3,[$mac,#12] +#else + strb $h0,[$mac,#0] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#4] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#8] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#12] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#1] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#5] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#9] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#13] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#2] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#6] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#10] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#14] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#3] + strb $h1,[$mac,#7] + strb $h2,[$mac,#11] + strb $h3,[$mac,#15] +#endif + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_emit,.-poly1305_emit +___ +{ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); +my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); +my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); + +my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.fpu neon + +.type poly1305_init_neon,%function +.align 5 +poly1305_init_neon: +.Lpoly1305_init_neon: + ldr r3,[$ctx,#48] @ first table element + cmp r3,#-1 @ is value impossible? 
+ bne .Lno_init_neon + + ldr r4,[$ctx,#20] @ load key base 2^32 + ldr r5,[$ctx,#24] + ldr r6,[$ctx,#28] + ldr r7,[$ctx,#32] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + and r3,r3,#0x03ffffff + and r4,r4,#0x03ffffff + and r5,r5,#0x03ffffff + + vdup.32 $R0,r2 @ r^1 in both lanes + add r2,r3,r3,lsl#2 @ *5 + vdup.32 $R1,r3 + add r3,r4,r4,lsl#2 + vdup.32 $S1,r2 + vdup.32 $R2,r4 + add r4,r5,r5,lsl#2 + vdup.32 $S2,r3 + vdup.32 $R3,r5 + add r5,r6,r6,lsl#2 + vdup.32 $S3,r4 + vdup.32 $R4,r6 + vdup.32 $S4,r5 + + mov $zeros,#2 @ counter + +.Lsquare_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + + vmull.u32 $D0,$R0,${R0}[1] + vmull.u32 $D1,$R1,${R0}[1] + vmull.u32 $D2,$R2,${R0}[1] + vmull.u32 $D3,$R3,${R0}[1] + vmull.u32 $D4,$R4,${R0}[1] + + vmlal.u32 $D0,$R4,${S1}[1] + vmlal.u32 $D1,$R0,${R1}[1] + vmlal.u32 $D2,$R1,${R1}[1] + vmlal.u32 $D3,$R2,${R1}[1] + vmlal.u32 $D4,$R3,${R1}[1] + + vmlal.u32 $D0,$R3,${S2}[1] + vmlal.u32 $D1,$R4,${S2}[1] + vmlal.u32 $D3,$R1,${R2}[1] + vmlal.u32 $D2,$R0,${R2}[1] + vmlal.u32 $D4,$R2,${R2}[1] + + vmlal.u32 $D0,$R2,${S3}[1] + vmlal.u32 $D3,$R0,${R3}[1] + vmlal.u32 $D1,$R3,${S3}[1] + vmlal.u32 $D2,$R4,${S3}[1] + vmlal.u32 $D4,$R1,${R3}[1] + + vmlal.u32 $D3,$R4,${S4}[1] + vmlal.u32 $D0,$R1,${S4}[1] + vmlal.u32 $D1,$R2,${S4}[1] + vmlal.u32 $D2,$R3,${S4}[1] + vmlal.u32 $D4,$R0,${R4}[1] + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + @ and P. Schwabe + @ + @ H0>>+H1>>+H2>>+H3>>+H4 + @ H3>>+H4>>*5+H0>>+H1 + @ + @ Trivia. + @ + @ Result of multiplication of n-bit number by m-bit number is + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, + @ m-bit number multiplied by 2^n is still n+m bits wide. + @ + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit + @ one is n+1 bits wide. + @ + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 + @ can be 27. However! In cases when their width exceeds 26 bits + @ they are limited by 2^26+2^6. This in turn means that *sum* + @ of the products with these values can still be viewed as sum + @ of 52-bit numbers as long as the amount of addends is not a + @ power of 2. For example, + @ + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, + @ + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than + @ 8 * (2^52) or 2^55. However, the value is then multiplied by + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), + @ which is less than 32 * (2^52) or 2^57. And when processing + @ data we are looking at triple as many addends... + @ + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. 
+ @ This means that result of reduction have to be compressed upon + @ loop wrap-around. This can be done in the process of reduction + @ to minimize amount of instructions [as well as amount of + @ 128-bit instructions, which benefits low-end processors], but + @ one has to watch for H2 (which is narrower than H0) and 5*H4 + @ not being wider than 58 bits, so that result of right shift + @ by 26 bits fits in 32 bits. This is also useful on x86, + @ because it allows to use paddd in place for paddq, which + @ benefits Atom, where paddq is ridiculously slow. + + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vbic.i32 $D4#lo,#0xfc000000 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vbic.i32 $D2#lo,#0xfc000000 + + vshr.u32 $T0#lo,$D0#lo,#26 + vbic.i32 $D0#lo,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + + subs $zeros,$zeros,#1 + beq .Lsquare_break_neon + + add $tbl0,$ctx,#(48+0*9*4) + add $tbl1,$ctx,#(48+1*9*4) + + vtrn.32 $R0,$D0#lo @ r^2:r^1 + vtrn.32 $R2,$D2#lo + vtrn.32 $R3,$D3#lo + vtrn.32 $R1,$D1#lo + vtrn.32 $R4,$D4#lo + + vshl.u32 $S2,$R2,#2 @ *5 + vshl.u32 $S3,$R3,#2 + vshl.u32 $S1,$R1,#2 + vshl.u32 $S4,$R4,#2 + vadd.i32 $S2,$S2,$R2 + vadd.i32 $S1,$S1,$R1 + vadd.i32 $S3,$S3,$R3 + vadd.i32 $S4,$S4,$R4 + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0,:32] + vst1.32 {${S4}[1]},[$tbl1,:32] + + b .Lsquare_neon + +.align 4 +.Lsquare_break_neon: + add $tbl0,$ctx,#(48+2*4*9) + add $tbl1,$ctx,#(48+3*4*9) + + vmov $R0,$D0#lo @ r^4:r^3 + vshl.u32 $S1,$D1#lo,#2 @ *5 + vmov $R1,$D1#lo + vshl.u32 $S2,$D2#lo,#2 + vmov $R2,$D2#lo + vshl.u32 $S3,$D3#lo,#2 + vmov $R3,$D3#lo + vshl.u32 $S4,$D4#lo,#2 + vmov $R4,$D4#lo + vadd.i32 $S1,$S1,$D1#lo + vadd.i32 $S2,$S2,$D2#lo + vadd.i32 $S3,$S3,$D3#lo + vadd.i32 $S4,$S4,$D4#lo + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0] + vst1.32 {${S4}[1]},[$tbl1] + +.Lno_init_neon: + ret @ bx lr +.size poly1305_init_neon,.-poly1305_init_neon + +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: + ldr ip,[$ctx,#36] @ is_base2_26 + + cmp $len,#64 + blo .Lpoly1305_blocks + + stmdb sp!,{r4-r7} + vstmdb sp!,{d8-d15} @ ABI specification says so + + tst ip,ip @ is_base2_26? 
+ bne .Lbase2_26_neon + + stmdb sp!,{r1-r3,lr} + bl .Lpoly1305_init_neon + + ldr r4,[$ctx,#0] @ load hash value base 2^32 + ldr r5,[$ctx,#4] + ldr r6,[$ctx,#8] + ldr r7,[$ctx,#12] + ldr ip,[$ctx,#16] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + veor $D0#lo,$D0#lo,$D0#lo + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + veor $D1#lo,$D1#lo,$D1#lo + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + veor $D2#lo,$D2#lo,$D2#lo + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + veor $D3#lo,$D3#lo,$D3#lo + and r3,r3,#0x03ffffff + orr r6,r6,ip,lsl#24 + veor $D4#lo,$D4#lo,$D4#lo + and r4,r4,#0x03ffffff + mov r1,#1 + and r5,r5,#0x03ffffff + str r1,[$ctx,#36] @ set is_base2_26 + + vmov.32 $D0#lo[0],r2 + vmov.32 $D1#lo[0],r3 + vmov.32 $D2#lo[0],r4 + vmov.32 $D3#lo[0],r5 + vmov.32 $D4#lo[0],r6 + adr $zeros,.Lzeros + + ldmia sp!,{r1-r3,lr} + b .Lhash_loaded + +.align 4 +.Lbase2_26_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ load hash value + + veor $D0#lo,$D0#lo,$D0#lo + veor $D1#lo,$D1#lo,$D1#lo + veor $D2#lo,$D2#lo,$D2#lo + veor $D3#lo,$D3#lo,$D3#lo + veor $D4#lo,$D4#lo,$D4#lo + vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + adr $zeros,.Lzeros + vld1.32 {$D4#lo[0]},[$ctx] + sub $ctx,$ctx,#16 @ rewind + +.Lhash_loaded: + add $in2,$inp,#32 + mov $padbit,$padbit,lsl#24 + tst $len,#31 + beq .Leven + + vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! + vmov.32 $H4#lo[0],$padbit + sub $len,$len,#16 + add $in2,$inp,#32 + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3#lo,$H3#lo,#18 + + vsri.u32 $H3#lo,$H2#lo,#14 + vshl.u32 $H2#lo,$H2#lo,#12 + vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi + + vbic.i32 $H3#lo,#0xfc000000 + vsri.u32 $H2#lo,$H1#lo,#20 + vshl.u32 $H1#lo,$H1#lo,#6 + + vbic.i32 $H2#lo,#0xfc000000 + vsri.u32 $H1#lo,$H0#lo,#26 + vadd.i32 $H3#hi,$H3#lo,$D3#lo + + vbic.i32 $H0#lo,#0xfc000000 + vbic.i32 $H1#lo,#0xfc000000 + vadd.i32 $H2#hi,$H2#lo,$D2#lo + + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + + mov $tbl1,$zeros + add $tbl0,$ctx,#48 + + cmp $len,$len + b .Long_tail + +.align 4 +.Leven: + subs $len,$len,#64 + it lo + movlo $in2,$zeros + + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + itt hi + addhi $tbl1,$ctx,#(48+1*9*4) + addhi $tbl0,$ctx,#(48+3*9*4) + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3,$H3,#18 + + vsri.u32 $H3,$H2,#14 + vshl.u32 $H2,$H2,#12 + + vbic.i32 $H3,#0xfc000000 + vsri.u32 $H2,$H1,#20 + vshl.u32 $H1,$H1,#6 + + vbic.i32 $H2,#0xfc000000 + vsri.u32 $H1,$H0,#26 + + vbic.i32 $H0,#0xfc000000 + vbic.i32 $H1,#0xfc000000 + + bls .Lskip_loop + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
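
The .Loop_neon comment just below spells out the two-way split that the r^2/r^4 table loads above prepare for: the message blocks are divided into an even-indexed and an odd-indexed stream, each stream is folded with r^2, and the odd stream finishes with a final multiply by r instead of r^2, so that the two partial sums add up to the usual sequential h = (h + m[i]) * r evaluation (the loop itself strides by r^4, two blocks per stream per iteration, but the algebra is the same). A sketch of that identity; horner_two_way is illustrative only, and plain wrapping uint64_t arithmetic stands in for arithmetic mod 2^130 - 5 since the identity holds in any commutative ring:

    #include <stdint.h>

    /* n must be even and >= 2. */
    static uint64_t horner_two_way(const uint64_t *m, int n, uint64_t r)
    {
            uint64_t r2 = r * r, even = 0, odd = 0;
            int i;

            for (i = 0; i + 3 < n; i += 2) {
                    even = (even + m[i]) * r2;
                    odd  = (odd + m[i + 1]) * r2;
            }
            even = (even + m[n - 2]) * r2;  /* even stream: last step uses r^2 */
            odd  = (odd  + m[n - 1]) * r;   /* odd stream:  last step uses r   */
            return even + odd;              /* == sequential h = (h + m[i]) * r */
    }
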
+ b .Loop_neon + +.align 5 +.Loop_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + @ \___________________/ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + @ \___________________/ \____________________/ + @ + @ Note that we start with inp[2:3]*r^2. This is because it + @ doesn't depend on reduction in previous iteration. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ inp[2:3]*r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] + vmull.u32 $D2,$H2#hi,${R0}[1] + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,${R0}[1] + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,${R0}[1] + vmlal.u32 $D2,$H1#hi,${R1}[1] + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,${R0}[1] + + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,${R0}[1] + subs $len,$len,#64 + vmlal.u32 $D0,$H4#hi,${S1}[1] + it lo + movlo $in2,$zeros + vmlal.u32 $D3,$H2#hi,${R1}[1] + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D1,$H0#hi,${R1}[1] + vmlal.u32 $D4,$H3#hi,${R1}[1] + + vmlal.u32 $D0,$H3#hi,${S2}[1] + vmlal.u32 $D3,$H1#hi,${R2}[1] + vmlal.u32 $D4,$H2#hi,${R2}[1] + vmlal.u32 $D1,$H4#hi,${S2}[1] + vmlal.u32 $D2,$H0#hi,${R2}[1] + + vmlal.u32 $D3,$H0#hi,${R3}[1] + vmlal.u32 $D0,$H2#hi,${S3}[1] + vmlal.u32 $D4,$H1#hi,${R3}[1] + vmlal.u32 $D1,$H3#hi,${S3}[1] + vmlal.u32 $D2,$H4#hi,${S3}[1] + + vmlal.u32 $D3,$H4#hi,${S4}[1] + vmlal.u32 $D0,$H1#hi,${S4}[1] + vmlal.u32 $D4,$H0#hi,${R4}[1] + vmlal.u32 $D1,$H2#hi,${S4}[1] + vmlal.u32 $D2,$H3#hi,${S4}[1] + + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4 and accumulate + + vmlal.u32 $D3,$H3#lo,${R0}[0] + vmlal.u32 $D0,$H0#lo,${R0}[0] + vmlal.u32 $D4,$H4#lo,${R0}[0] + vmlal.u32 $D1,$H1#lo,${R0}[0] + vmlal.u32 $D2,$H2#lo,${R0}[0] + vld1.32 ${S4}[0],[$tbl0,:32] + + vmlal.u32 $D3,$H2#lo,${R1}[0] + vmlal.u32 $D0,$H4#lo,${S1}[0] + vmlal.u32 $D4,$H3#lo,${R1}[0] + vmlal.u32 $D1,$H0#lo,${R1}[0] + vmlal.u32 $D2,$H1#lo,${R1}[0] + + vmlal.u32 $D3,$H1#lo,${R2}[0] + vmlal.u32 $D0,$H3#lo,${S2}[0] + vmlal.u32 $D4,$H2#lo,${R2}[0] + vmlal.u32 $D1,$H4#lo,${S2}[0] + vmlal.u32 $D2,$H0#lo,${R2}[0] + + vmlal.u32 $D3,$H0#lo,${R3}[0] + vmlal.u32 $D0,$H2#lo,${S3}[0] + vmlal.u32 $D4,$H1#lo,${R3}[0] + vmlal.u32 $D1,$H3#lo,${S3}[0] + vmlal.u32 $D3,$H4#lo,${S4}[0] + + vmlal.u32 $D2,$H4#lo,${S3}[0] + vmlal.u32 $D0,$H1#lo,${S4}[0] + vmlal.u32 $D4,$H0#lo,${R4}[0] + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vmlal.u32 $D1,$H2#lo,${S4}[0] + vmlal.u32 $D2,$H3#lo,${S4}[0] + + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 + vrev32.8 $H3,$H3 +# endif + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction interleaved with base 2^32 -> base 2^26 of + @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 
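
In scalar terms, the reduction announced by the comment above (here interleaved with the radix conversion of the next input blocks) is one pass of carry propagation across five 26-bit limbs, with the carry out of the top limb folded back into the bottom limb multiplied by 5, since 2^130 is congruent to 5 modulo 2^130 - 5. A C sketch of one such pass over the 64-bit accumulators; poly1305_partial_reduce is an illustrative name, the ordering of the independent carries differs slightly from the vector code, and this is a partial reduction only, not a final one:

    #include <stdint.h>

    static void poly1305_partial_reduce(uint64_t d[5])
    {
            const uint64_t mask = (1ULL << 26) - 1;
            uint64_t c;

            c = d[3] >> 26; d[3] &= mask; d[4] += c;      /* h3 -> h4     */
            c = d[0] >> 26; d[0] &= mask; d[1] += c;      /* h0 -> h1     */
            c = d[4] >> 26; d[4] &= mask; d[0] += c * 5;  /* h4 -> h0, *5 */
            c = d[1] >> 26; d[1] &= mask; d[2] += c;      /* h1 -> h2     */
            c = d[2] >> 26; d[2] &= mask; d[3] += c;      /* h2 -> h3     */
            c = d[0] >> 26; d[0] &= mask; d[1] += c;      /* h0 -> h1     */
            c = d[3] >> 26; d[3] &= mask; d[4] += c;      /* h3 -> h4     */
    }
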
+ + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vshl.u32 $H3,$H3,#18 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vsri.u32 $H3,$H2,#14 + vbic.i32 $D4#lo,#0xfc000000 + vshl.u32 $H2,$H2,#12 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vbic.i32 $H3,#0xfc000000 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] + vsri.u32 $H2,$H1,#20 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vshl.u32 $H1,$H1,#6 + vbic.i32 $D2#lo,#0xfc000000 + vbic.i32 $H2,#0xfc000000 + + vshrn.u64 $T0#lo,$D0,#26 @ re-narrow + vmovn.i64 $D0#lo,$D0 + vsri.u32 $H1,$H0,#26 + vbic.i32 $H0,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vbic.i32 $D0#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + vbic.i32 $H1,#0xfc000000 + + bhi .Loop_neon + +.Lskip_loop: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + add $tbl1,$ctx,#(48+0*9*4) + add $tbl0,$ctx,#(48+1*9*4) + adds $len,$len,#32 + it ne + movne $len,#0 + bne .Long_tail + + vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H3#hi,$H3#lo,$D3#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + vadd.i32 $H4#hi,$H4#lo,$D4#lo + +.Long_tail: + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant + vmull.u32 $D2,$H2#hi,$R0 + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,$R0 + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,$R0 + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,$R0 + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,$R0 + + vmlal.u32 $D0,$H4#hi,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vmlal.u32 $D3,$H2#hi,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#hi,$R1 + vmlal.u32 $D4,$H3#hi,$R1 + vmlal.u32 $D2,$H1#hi,$R1 + + vmlal.u32 $D3,$H1#hi,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#hi,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#hi,$R2 + vmlal.u32 $D1,$H4#hi,$S2 + vmlal.u32 $D2,$H0#hi,$R2 + + vmlal.u32 $D3,$H0#hi,$R3 + it ne + addne $tbl1,$ctx,#(48+2*9*4) + vmlal.u32 $D0,$H2#hi,$S3 + it ne + addne $tbl0,$ctx,#(48+3*9*4) + vmlal.u32 $D4,$H1#hi,$R3 + vmlal.u32 $D1,$H3#hi,$S3 + vmlal.u32 $D2,$H4#hi,$S3 + + vmlal.u32 $D3,$H4#hi,$S4 + vorn $MASK,$MASK,$MASK @ all-ones, can be redundant + vmlal.u32 $D0,$H1#hi,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#hi,$R4 + vmlal.u32 $D1,$H2#hi,$S4 + vmlal.u32 $D2,$H3#hi,$S4 + + beq .Lshort_tail + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4:r^3 and accumulate + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + + vmlal.u32 $D2,$H2#lo,$R0 + vmlal.u32 $D0,$H0#lo,$R0 + vmlal.u32 $D3,$H3#lo,$R0 + vmlal.u32 $D1,$H1#lo,$R0 + vmlal.u32 $D4,$H4#lo,$R0 + + vmlal.u32 $D0,$H4#lo,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 
+ vmlal.u32 $D3,$H2#lo,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#lo,$R1 + vmlal.u32 $D4,$H3#lo,$R1 + vmlal.u32 $D2,$H1#lo,$R1 + + vmlal.u32 $D3,$H1#lo,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#lo,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#lo,$R2 + vmlal.u32 $D1,$H4#lo,$S2 + vmlal.u32 $D2,$H0#lo,$R2 + + vmlal.u32 $D3,$H0#lo,$R3 + vmlal.u32 $D0,$H2#lo,$S3 + vmlal.u32 $D4,$H1#lo,$R3 + vmlal.u32 $D1,$H3#lo,$S3 + vmlal.u32 $D2,$H4#lo,$S3 + + vmlal.u32 $D3,$H4#lo,$S4 + vorn $MASK,$MASK,$MASK @ all-ones + vmlal.u32 $D0,$H1#lo,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#lo,$R4 + vmlal.u32 $D1,$H2#lo,$S4 + vmlal.u32 $D2,$H3#lo,$S4 + +.Lshort_tail: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ horizontal addition + + vadd.i64 $D3#lo,$D3#lo,$D3#hi + vadd.i64 $D0#lo,$D0#lo,$D0#hi + vadd.i64 $D4#lo,$D4#lo,$D4#hi + vadd.i64 $D1#lo,$D1#lo,$D1#hi + vadd.i64 $D2#lo,$D2#lo,$D2#hi + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction, but without narrowing + + vshr.u64 $T0,$D3,#26 + vand.i64 $D3,$D3,$MASK + vshr.u64 $T1,$D0,#26 + vand.i64 $D0,$D0,$MASK + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + + vshr.u64 $T0,$D4,#26 + vand.i64 $D4,$D4,$MASK + vshr.u64 $T1,$D1,#26 + vand.i64 $D1,$D1,$MASK + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + + vadd.i64 $D0,$D0,$T0 + vshl.u64 $T0,$T0,#2 + vshr.u64 $T1,$D2,#26 + vand.i64 $D2,$D2,$MASK + vadd.i64 $D0,$D0,$T0 @ h4 -> h0 + vadd.i64 $D3,$D3,$T1 @ h2 -> h3 + + vshr.u64 $T0,$D0,#26 + vand.i64 $D0,$D0,$MASK + vshr.u64 $T1,$D3,#26 + vand.i64 $D3,$D3,$MASK + vadd.i64 $D1,$D1,$T0 @ h0 -> h1 + vadd.i64 $D4,$D4,$T1 @ h3 -> h4 + + cmp $len,#0 + bne .Leven + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ store hash value + + vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + vst1.32 {$D4#lo[0]},[$ctx] + + vldmia sp!,{d8-d15} @ epilogue + ldmia sp!,{r4-r7} + ret @ bx lr +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +#ifndef __KERNEL__ +.LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else +.word OPENSSL_armcap_P-.Lpoly1305_init +# endif +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +#endif +___ +} } +$code.=<<___; +.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm" +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c new file mode 100644 index 000000000000..2603b0771f2c --- /dev/null +++ b/lib/crypto/arm/poly1305-glue.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM + * + * Copyright (C) 2019 Linaro Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +void __weak poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit) +{ +} + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + len = round_down(len, POLY1305_BLOCK_SIZE); + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon)) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks_arm(state, src, len, padbit); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +bool poly1305_is_arch_optimized(void) +{ + /* We always can use at least the ARM scalar implementation. */ + return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init arm_poly1305_mod_init(void) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + (elf_hwcap & HWCAP_NEON)) + static_branch_enable(&have_neon); + return 0; +} +subsys_initcall(arm_poly1305_mod_init); + +static void __exit arm_poly1305_mod_exit(void) +{ +} +module_exit(arm_poly1305_mod_exit); + +MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm/sha256-armv4.pl b/lib/crypto/arm/sha256-armv4.pl new file mode 100644 index 000000000000..8122db7fd599 --- /dev/null +++ b/lib/crypto/arm/sha256-armv4.pl @@ -0,0 +1,724 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA256 block procedure for ARMv4. May 2007. + +# Performance is ~2x better than gcc 3.4 generated code and in "abso- +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +# byte [on single-issue Xscale PXA250 core]. + +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 22% improvement on +# Cortex A8 core and ~20 cycles per processed byte. + +# February 2011. 
+# +# Profiler-assisted and platform-specific optimization resulted in 16% +# improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +# September 2013. +# +# Add NEON implementation. On Cortex A8 it was measured to process one +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +# code (meaning that latter performs sub-optimally, nothing was done +# about it). + +# May 2014. +# +# Add ARMv8 code path performing at 2.0 cpb on Apple A7. + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$ctx="r0"; $t0="r0"; +$inp="r1"; $t4="r1"; +$len="r2"; $t1="r2"; +$T1="r3"; $t3="r3"; +$A="r4"; +$B="r5"; +$C="r6"; +$D="r7"; +$E="r8"; +$F="r9"; +$G="r10"; +$H="r11"; +@V=($A,$B,$C,$D,$E,$F,$G,$H); +$t2="r12"; +$Ktbl="r14"; + +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___ if ($i<16); +#if __ARM_ARCH__>=7 + @ ldr $t1,[$inp],#4 @ $i +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +# ifndef __ARMEB__ + rev $t1,$t1 +# endif +#else + @ ldrb $t1,[$inp,#3] @ $i + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + ldrb $t2,[$inp,#2] + ldrb $t0,[$inp,#1] + orr $t1,$t1,$t2,lsl#8 + ldrb $t2,[$inp],#4 + orr $t1,$t1,$t0,lsl#16 +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + orr $t1,$t1,$t2,lsl#24 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +#endif +___ +$code.=<<___; + ldr $t2,[$Ktbl],#4 @ *K256++ + add $h,$h,$t1 @ h+=X[i] + str $t1,[sp,#`$i%16`*4] + eor $t1,$f,$g + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) + and $t1,$t1,$e + add $h,$h,$t2 @ h+=K256[i] + eor $t1,$t1,$g @ Ch(e,f,g) + eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` + add $h,$h,$t1 @ h+=Ch(e,f,g) +#if $i==31 + and $t2,$t2,#0xff + cmp $t2,#0xf2 @ done? 
+#endif +#if $i<15 +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 @ prefetch +# else + ldrb $t1,[$inp,#3] +# endif + eor $t2,$a,$b @ a^b, b^c in next round +#else + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx + eor $t2,$a,$b @ a^b, b^c in next round + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx +#endif + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) + and $t3,$t3,$t2 @ (b^c)&=(a^b) + add $d,$d,$h @ d+=h + eor $t3,$t3,$b @ Maj(a,b,c) + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) + @ add $h,$h,$t3 @ h+=Maj(a,b,c) +___ + ($t2,$t3)=($t3,$t2); +} + +sub BODY_16_XX { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i + @ ldr $t4,[sp,#`($i+14)%16`*4] + mov $t0,$t1,ror#$sigma0[0] + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + mov $t2,$t4,ror#$sigma1[0] + eor $t0,$t0,$t1,ror#$sigma0[1] + eor $t2,$t2,$t4,ror#$sigma1[1] + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) + ldr $t1,[sp,#`($i+0)%16`*4] + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) + ldr $t4,[sp,#`($i+9)%16`*4] + + add $t2,$t2,$t0 + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 + add $t1,$t1,$t2 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) + add $t1,$t1,$t4 @ X[i] +___ + &BODY_00_15(@_); +} + +$code=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +.text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else +.code 32 +# endif +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha256_blocks_arch +#endif +.align 5 + +.global sha256_blocks_arch +.type sha256_blocks_arch,%function +sha256_blocks_arch: +.Lsha256_blocks_arch: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ sha256_blocks_arch +#else + adr r3,.Lsha256_blocks_arch +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#ARMV8_SHA256 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} + ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} + sub $Ktbl,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 +# else + ldrb $t1,[$inp,#3] +# endif + eor $t3,$B,$C @ magic + eor $t2,$t2,$t2 +___ +for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=".Lrounds_16_xx:\n"; +for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +#if __ARM_ARCH__>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq 
$t3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t0,[$t3,#0] + ldr $t1,[$t3,#4] + ldr $t2,[$t3,#8] + add $A,$A,$t0 + ldr $t0,[$t3,#12] + add $B,$B,$t1 + ldr $t1,[$t3,#16] + add $C,$C,$t2 + ldr $t2,[$t3,#20] + add $D,$D,$t0 + ldr $t0,[$t3,#24] + add $E,$E,$t1 + ldr $t1,[$t3,#28] + add $F,$F,$t2 + ldr $inp,[sp,#17*4] @ pull inp + ldr $t2,[sp,#18*4] @ pull inp+len + add $G,$G,$t0 + add $H,$H,$t1 + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} + cmp $inp,$t2 + sub $Ktbl,$Ktbl,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#`16+3`*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha256_blocks_arch,.-sha256_blocks_arch +___ +###################################################################### +# NEON stuff +# +{{{ +my @X=map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); +my $Xfer=$t4; +my $j=0; + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + 
eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + while($#insns>=2) { eval(shift(@insns)); } + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vrev32_8 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&eor ($t1,$f,$g)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&and ($t1,$t1,$e)', + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&eor ($t1,$t1,$g)', # Ch(e,f,g) + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&ldr ($t1,"[sp,#64]") if ($j==31)', + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&add ($d,$d,$h)', # d+=h + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 4 +sha256_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + + sub $H,sp,#16*4+16 + adr $Ktbl,.Lsha256_blocks_arch + sub $Ktbl,$Ktbl,#.Lsha256_blocks_arch-K256 + bic $H,$H,#15 @ align for 128-bit stores + mov $t2,sp + mov sp,$H @ alloca + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + + vld1.8 {@X[0]},[$inp]! + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + vld1.32 {$T0},[$Ktbl,:128]! + vld1.32 {$T1},[$Ktbl,:128]! + vld1.32 {$T2},[$Ktbl,:128]! + vld1.32 {$T3},[$Ktbl,:128]! + vrev32.8 @X[0],@X[0] @ yes, even on + str $ctx,[sp,#64] + vrev32.8 @X[1],@X[1] @ big-endian + str $inp,[sp,#68] + mov $Xfer,sp + vrev32.8 @X[2],@X[2] + str $len,[sp,#72] + vrev32.8 @X[3],@X[3] + str $t2,[sp,#76] @ save original sp + vadd.i32 $T0,$T0,@X[0] + vadd.i32 $T1,$T1,@X[1] + vst1.32 {$T0},[$Xfer,:128]! + vadd.i32 $T2,$T2,@X[2] + vst1.32 {$T1},[$Xfer,:128]! + vadd.i32 $T3,$T3,@X[3] + vst1.32 {$T2},[$Xfer,:128]! + vst1.32 {$T3},[$Xfer,:128]! 
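
(For orientation: the Xupdate()/Xpreload() code above vectorizes the SHA-256 message schedule four words at a time and stores W[t]+K[t] to the stack area for the scalar round code to consume. Below is a minimal, standalone C sketch of the scalar recurrence being vectorized; it is illustrative only, and the helper names sha256_expand/s0/s1/ror32 are not from this patch.)

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

/* FIPS 180-4 small sigma functions. */
static inline uint32_t s0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
static inline uint32_t s1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

/* Expand the 16 words of one input block into the full 64-word schedule. */
static void sha256_expand(uint32_t W[64])
{
	for (int t = 16; t < 64; t++)
		W[t] = s1(W[t - 2]) + W[t - 7] + s0(W[t - 15]) + W[t - 16];
}

(The NEON path computes four consecutive W[t] values per Xupdate() call and folds the round constant in before storing, which is why the round bodies only need a single ldr per round.)
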
+ + ldmia $ctx,{$A-$H} + sub $Xfer,$Xfer,#64 + ldr $t1,[sp,#0] + eor $t2,$t2,$t2 + eor $t3,$B,$C + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + teq $t1,#0 @ check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + ldr $inp,[sp,#68] + ldr $t0,[sp,#72] + sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl + teq $inp,$t0 + it eq + subeq $inp,$inp,#64 @ avoid SEGV + vld1.8 {@X[0]},[$inp]! @ load next input block + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + it ne + strne $inp,[sp,#68] + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + ldr $t0,[$t1,#0] + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t2,[$t1,#4] + ldr $t3,[$t1,#8] + ldr $t4,[$t1,#12] + add $A,$A,$t0 @ accumulate + ldr $t0,[$t1,#16] + add $B,$B,$t2 + ldr $t2,[$t1,#20] + add $C,$C,$t3 + ldr $t3,[$t1,#24] + add $D,$D,$t4 + ldr $t4,[$t1,#28] + add $E,$E,$t0 + str $A,[$t1],#4 + add $F,$F,$t2 + str $B,[$t1],#4 + add $G,$G,$t3 + str $C,[$t1],#4 + add $H,$H,$t4 + str $D,[$t1],#4 + stmia $t1,{$E-$H} + + ittte ne + movne $Xfer,sp + ldrne $t1,[sp,#0] + eorne $t2,$t2,$t2 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne $t3,$B,$C + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +___ +}}} +###################################################################### +# ARMv8 stuff +# +{{{ +my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2)); +my @MSG=map("q$_",(8..11)); +my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); +my $Ktbl="r3"; + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# ifdef __thumb2__ +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + +.type sha256_block_data_order_armv8,%function +.align 5 +sha256_block_data_order_armv8: +.LARMv8: + vld1.32 {$ABCD,$EFGH},[$ctx] +# ifdef __thumb2__ + adr $Ktbl,.LARMv8 + sub $Ktbl,$Ktbl,#.LARMv8-K256 +# else + adrl $Ktbl,K256 +# endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + +.Loop_v8: + vld1.8 {@MSG[0]-@MSG[1]},[$inp]! + vld1.8 {@MSG[2]-@MSG[3]},[$inp]! + vld1.32 {$W0},[$Ktbl]! + vrev32.8 @MSG[0],@MSG[0] + vrev32.8 @MSG[1],@MSG[1] + vrev32.8 @MSG[2],@MSG[2] + vrev32.8 @MSG[3],@MSG[3] + vmov $ABCD_SAVE,$ABCD @ offload + vmov $EFGH_SAVE,$EFGH + teq $inp,$len +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vld1.32 {$W0},[$Ktbl]! 
+ vadd.i32 $W1,$W1,@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vld1.32 {$W1},[$Ktbl] + vadd.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#256-16 @ rewind + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vadd.i32 $W1,$W1,@MSG[3] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vadd.i32 $ABCD,$ABCD,$ABCD_SAVE + vadd.i32 $EFGH,$EFGH,$EFGH_SAVE + it ne + bne .Loop_v8 + + vst1.32 {$ABCD,$EFGH},[$ctx] + + ret @ bx lr +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +___ +}}} +$code.=<<___; +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +{ my %opcode = ( + "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40, + "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } +} + +foreach (split($/,$code)) { + + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; + + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + +close STDOUT; # enforce flush diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S new file mode 100644 index 000000000000..ac2c9b01b22d --- /dev/null +++ b/lib/crypto/arm/sha256-ce.S @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd. + * Author: Ard Biesheuvel + */ + +#include +#include + + .text + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + k0 .req q7 + k1 .req q8 + rk .req r3 + + ta0 .req q9 + ta1 .req q10 + tb0 .req q10 + tb1 .req q9 + + dga .req q11 + dgb .req q12 + + dg0 .req q13 + dg1 .req q14 + dg2 .req q15 + + .macro add_only, ev, s0 + vmov dg2, dg0 + .ifnb \s0 + vld1.32 {k\ev}, [rk, :128]! 
+ .endif + sha256h.32 dg0, dg1, tb\ev + sha256h2.32 dg1, dg2, tb\ev + .ifnb \s0 + vadd.u32 ta\ev, q\s0, k\ev + .endif + .endm + + .macro add_update, ev, s0, s1, s2, s3 + sha256su0.32 q\s0, q\s1 + add_only \ev, \s1 + sha256su1.32 q\s0, q\s2, q\s3 + .endm + + .align 6 +.Lsha256_rcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + + /* + * void sha256_ce_transform(u32 state[SHA256_STATE_WORDS], + * const u8 *data, size_t nblocks); + */ +ENTRY(sha256_ce_transform) + /* load state */ + vld1.32 {dga-dgb}, [r0] + + /* load input */ +0: vld1.32 {q0-q1}, [r1]! + vld1.32 {q2-q3}, [r1]! + subs r2, r2, #1 + +#ifndef CONFIG_CPU_BIG_ENDIAN + vrev32.8 q0, q0 + vrev32.8 q1, q1 + vrev32.8 q2, q2 + vrev32.8 q3, q3 +#endif + + /* load first round constant */ + adr rk, .Lsha256_rcon + vld1.32 {k0}, [rk, :128]! + + vadd.u32 ta0, q0, k0 + vmov dg0, dga + vmov dg1, dgb + + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + + add_only 1, 1 + add_only 0, 2 + add_only 1, 3 + add_only 0 + + /* update state */ + vadd.u32 dga, dga, dg0 + vadd.u32 dgb, dgb, dg1 + bne 0b + + /* store new state */ + vst1.32 {dga-dgb}, [r0] + bx lr +ENDPROC(sha256_ce_transform) diff --git a/lib/crypto/arm/sha256.c b/lib/crypto/arm/sha256.c new file mode 100644 index 000000000000..109192e54b0f --- /dev/null +++ b/lib/crypto/arm/sha256.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-256 optimized for ARM + * + * Copyright 2025 Google LLC + */ +#include +#include +#include +#include + +asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); +EXPORT_SYMBOL_GPL(sha256_blocks_arch); +asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); +asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon)) { + kernel_neon_begin(); + if (static_branch_likely(&have_ce)) + sha256_ce_transform(state, data, nblocks); + else + sha256_block_data_order_neon(state, data, nblocks); + kernel_neon_end(); + } else { + sha256_blocks_arch(state, data, nblocks); + } +} +EXPORT_SYMBOL_GPL(sha256_blocks_simd); + +bool 
sha256_is_arch_optimized(void)
+{
+	/* We always can use at least the ARM scalar implementation. */
+	return true;
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+static int __init sha256_arm_mod_init(void)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+		static_branch_enable(&have_neon);
+		if (elf_hwcap2 & HWCAP2_SHA2)
+			static_branch_enable(&have_ce);
+	}
+	return 0;
+}
+subsys_initcall(sha256_arm_mod_init);
+
+static void __exit sha256_arm_mod_exit(void)
+{
+}
+module_exit(sha256_arm_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 optimized for ARM");
--
cgit v1.2.3


From 61f86c70cf419c826ee3b9c175d1e355844d8b45 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Thu, 19 Jun 2025 12:19:01 -0700
Subject: lib/crypto: arm64: Move arch/arm64/lib/crypto/ into lib/crypto/

Move the contents of arch/arm64/lib/crypto/ into lib/crypto/arm64/.

The new code organization makes a lot more sense for how this code
actually works and is developed. In particular, it makes it possible to
build each algorithm as a single module, with better inlining and dead
code elimination. For a more detailed explanation, see the patchset
which did this for the CRC library code:
https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/.
Also see the patchset which did this for SHA-512:
https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/

This is just a preparatory commit, which does the move to get the files
into their new location but keeps them building the same way as before.
Later commits will make the actual improvements to the way the
arch-optimized code is integrated for each algorithm.

Add a gitignore entry for the removed directory arch/arm64/lib/crypto/
so that people don't accidentally commit leftover generated files.

Acked-by: Ard Biesheuvel
Reviewed-by: Martin K.
Petersen Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/r/20250619191908.134235-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/lib/.gitignore | 4 + arch/arm64/lib/Makefile | 3 - arch/arm64/lib/crypto/.gitignore | 3 - arch/arm64/lib/crypto/Kconfig | 20 - arch/arm64/lib/crypto/Makefile | 24 - arch/arm64/lib/crypto/chacha-neon-core.S | 805 --------------------------- arch/arm64/lib/crypto/chacha-neon-glue.c | 119 ---- arch/arm64/lib/crypto/poly1305-armv8.pl | 917 ------------------------------- arch/arm64/lib/crypto/poly1305-glue.c | 73 --- arch/arm64/lib/crypto/sha2-armv8.pl | 786 -------------------------- arch/arm64/lib/crypto/sha256-ce.S | 136 ----- arch/arm64/lib/crypto/sha256.c | 75 --- lib/crypto/Kconfig | 2 +- lib/crypto/Makefile | 3 +- lib/crypto/arm64/.gitignore | 2 + lib/crypto/arm64/Kconfig | 20 + lib/crypto/arm64/Makefile | 24 + lib/crypto/arm64/chacha-neon-core.S | 805 +++++++++++++++++++++++++++ lib/crypto/arm64/chacha-neon-glue.c | 119 ++++ lib/crypto/arm64/poly1305-armv8.pl | 917 +++++++++++++++++++++++++++++++ lib/crypto/arm64/poly1305-glue.c | 73 +++ lib/crypto/arm64/sha2-armv8.pl | 786 ++++++++++++++++++++++++++ lib/crypto/arm64/sha256-ce.S | 136 +++++ lib/crypto/arm64/sha256.c | 75 +++ 24 files changed, 2964 insertions(+), 2963 deletions(-) create mode 100644 arch/arm64/lib/.gitignore delete mode 100644 arch/arm64/lib/crypto/.gitignore delete mode 100644 arch/arm64/lib/crypto/Kconfig delete mode 100644 arch/arm64/lib/crypto/Makefile delete mode 100644 arch/arm64/lib/crypto/chacha-neon-core.S delete mode 100644 arch/arm64/lib/crypto/chacha-neon-glue.c delete mode 100644 arch/arm64/lib/crypto/poly1305-armv8.pl delete mode 100644 arch/arm64/lib/crypto/poly1305-glue.c delete mode 100644 arch/arm64/lib/crypto/sha2-armv8.pl delete mode 100644 arch/arm64/lib/crypto/sha256-ce.S delete mode 100644 arch/arm64/lib/crypto/sha256.c create mode 100644 lib/crypto/arm64/Kconfig create mode 100644 lib/crypto/arm64/Makefile create mode 100644 lib/crypto/arm64/chacha-neon-core.S create mode 100644 lib/crypto/arm64/chacha-neon-glue.c create mode 100644 lib/crypto/arm64/poly1305-armv8.pl create mode 100644 lib/crypto/arm64/poly1305-glue.c create mode 100644 lib/crypto/arm64/sha2-armv8.pl create mode 100644 lib/crypto/arm64/sha256-ce.S create mode 100644 lib/crypto/arm64/sha256.c diff --git a/arch/arm64/lib/.gitignore b/arch/arm64/lib/.gitignore new file mode 100644 index 000000000000..647d7a922e68 --- /dev/null +++ b/arch/arm64/lib/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +# This now-removed directory used to contain generated files. 
+/crypto/ diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 027bfa9689c6..d97e290619bc 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 - -obj-y += crypto/ - lib-y := clear_user.o delay.o copy_from_user.o \ copy_to_user.o copy_page.o \ clear_page.o csum.o insn.o memchr.o memcpy.o \ diff --git a/arch/arm64/lib/crypto/.gitignore b/arch/arm64/lib/crypto/.gitignore deleted file mode 100644 index 12d74d8b03d0..000000000000 --- a/arch/arm64/lib/crypto/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -poly1305-core.S -sha256-core.S diff --git a/arch/arm64/lib/crypto/Kconfig b/arch/arm64/lib/crypto/Kconfig deleted file mode 100644 index 129a7685cb4c..000000000000 --- a/arch/arm64/lib/crypto/Kconfig +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA20_NEON - tristate - depends on KERNEL_MODE_NEON - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_NEON - tristate - depends on KERNEL_MODE_NEON - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - -config CRYPTO_SHA256_ARM64 - tristate - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD diff --git a/arch/arm64/lib/crypto/Makefile b/arch/arm64/lib/crypto/Makefile deleted file mode 100644 index 946c09903711..000000000000 --- a/arch/arm64/lib/crypto/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o -chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o -poly1305-neon-y := poly1305-core.o poly1305-glue.o -AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch -AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch - -obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o -sha256-arm64-y := sha256.o sha256-core.o -sha256-arm64-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) void $(@) - -$(obj)/%-core.S: $(src)/%-armv8.pl - $(call cmd,perlasm) - -$(obj)/sha256-core.S: $(src)/sha2-armv8.pl - $(call cmd,perlasm) - -clean-files += poly1305-core.S sha256-core.S diff --git a/arch/arm64/lib/crypto/chacha-neon-core.S b/arch/arm64/lib/crypto/chacha-neon-core.S deleted file mode 100644 index 80079586ecc7..000000000000 --- a/arch/arm64/lib/crypto/chacha-neon-core.S +++ /dev/null @@ -1,805 +0,0 @@ -/* - * ChaCha/HChaCha NEON helper functions - * - * Copyright (C) 2016-2018 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Originally based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include -#include -#include - - .text - .align 6 - -/* - * chacha_permute - permute one block - * - * Permute one 64-byte block where the state matrix is stored in the four NEON - * registers v0-v3. 
It performs matrix operations on four words in parallel, - * but requires shuffling to rearrange the words after each round. - * - * The round count is given in w3. - * - * Clobbers: w3, x10, v4, v12 - */ -SYM_FUNC_START_LOCAL(chacha_permute) - - adr_l x10, ROT8 - ld1 {v12.4s}, [x10] - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - tbl v3.16b, {v3.16b}, v12.16b - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - ext v1.16b, v1.16b, v1.16b, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - ext v3.16b, v3.16b, v3.16b, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - tbl v3.16b, {v3.16b}, v12.16b - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - ext v1.16b, v1.16b, v1.16b, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - ext v3.16b, v3.16b, v3.16b, #4 - - subs w3, w3, #2 - b.ne .Ldoubleround - - ret -SYM_FUNC_END(chacha_permute) - -SYM_FUNC_START(chacha_block_xor_neon) - // x0: Input state matrix, s - // x1: 1 data block output, o - // x2: 1 data block input, i - // w3: nrounds - - stp x29, x30, [sp, #-16]! - mov x29, sp - - // x0..3 = s0..3 - ld1 {v0.4s-v3.4s}, [x0] - ld1 {v8.4s-v11.4s}, [x0] - - bl chacha_permute - - ld1 {v4.16b-v7.16b}, [x2] - - // o0 = i0 ^ (x0 + s0) - add v0.4s, v0.4s, v8.4s - eor v0.16b, v0.16b, v4.16b - - // o1 = i1 ^ (x1 + s1) - add v1.4s, v1.4s, v9.4s - eor v1.16b, v1.16b, v5.16b - - // o2 = i2 ^ (x2 + s2) - add v2.4s, v2.4s, v10.4s - eor v2.16b, v2.16b, v6.16b - - // o3 = i3 ^ (x3 + s3) - add v3.4s, v3.4s, v11.4s - eor v3.16b, v3.16b, v7.16b - - st1 {v0.16b-v3.16b}, [x1] - - ldp x29, x30, [sp], #16 - ret -SYM_FUNC_END(chacha_block_xor_neon) - -SYM_FUNC_START(hchacha_block_neon) - // x0: Input state matrix, s - // x1: output (8 32-bit words) - // w2: nrounds - - stp x29, x30, [sp, #-16]! 
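
(For orientation: chacha_permute above applies the ChaCha double round to the whole 4x4 state, one row of four 32-bit words per NEON register. A minimal, standalone C sketch of the scalar quarter-round it parallelizes follows; it is illustrative only, and the names chacha_qr/rol32 are not from the file being moved.)

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* One ChaCha quarter-round (RFC 7539, section 2.1) on four state words. */
static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = rol32(*d, 16);
	*c += *d; *b ^= *c; *b = rol32(*b, 12);
	*a += *b; *d ^= *a; *d = rol32(*d, 8);
	*c += *d; *b ^= *c; *b = rol32(*b, 7);
}

(Each .Ldoubleround iteration performs four such quarter-rounds on the columns and, after the ext-based lane rotations, four more on the diagonals.)
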
- mov x29, sp - - ld1 {v0.4s-v3.4s}, [x0] - - mov w3, w2 - bl chacha_permute - - st1 {v0.4s}, [x1], #16 - st1 {v3.4s}, [x1] - - ldp x29, x30, [sp], #16 - ret -SYM_FUNC_END(hchacha_block_neon) - - a0 .req w12 - a1 .req w13 - a2 .req w14 - a3 .req w15 - a4 .req w16 - a5 .req w17 - a6 .req w19 - a7 .req w20 - a8 .req w21 - a9 .req w22 - a10 .req w23 - a11 .req w24 - a12 .req w25 - a13 .req w26 - a14 .req w27 - a15 .req w28 - - .align 6 -SYM_FUNC_START(chacha_4block_xor_neon) - frame_push 10 - - // x0: Input state matrix, s - // x1: 4 data blocks output, o - // x2: 4 data blocks input, i - // w3: nrounds - // x4: byte count - - adr_l x10, .Lpermute - and x5, x4, #63 - add x10, x10, x5 - - // - // This function encrypts four consecutive ChaCha blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. For final XORing step we transpose the - // matrix by interleaving 32- and then 64-bit words, which allows us to - // do XOR in NEON registers. - // - // At the same time, a fifth block is encrypted in parallel using - // scalar registers - // - adr_l x9, CTRINC // ... and ROT8 - ld1 {v30.4s-v31.4s}, [x9] - - // x0..15[0-3] = s0..3[0..3] - add x8, x0, #16 - ld4r { v0.4s- v3.4s}, [x0] - ld4r { v4.4s- v7.4s}, [x8], #16 - ld4r { v8.4s-v11.4s}, [x8], #16 - ld4r {v12.4s-v15.4s}, [x8] - - mov a0, v0.s[0] - mov a1, v1.s[0] - mov a2, v2.s[0] - mov a3, v3.s[0] - mov a4, v4.s[0] - mov a5, v5.s[0] - mov a6, v6.s[0] - mov a7, v7.s[0] - mov a8, v8.s[0] - mov a9, v9.s[0] - mov a10, v10.s[0] - mov a11, v11.s[0] - mov a12, v12.s[0] - mov a13, v13.s[0] - mov a14, v14.s[0] - mov a15, v15.s[0] - - // x12 += counter values 1-4 - add v12.4s, v12.4s, v30.4s - -.Ldoubleround4: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - add v0.4s, v0.4s, v4.4s - add a0, a0, a4 - add v1.4s, v1.4s, v5.4s - add a1, a1, a5 - add v2.4s, v2.4s, v6.4s - add a2, a2, a6 - add v3.4s, v3.4s, v7.4s - add a3, a3, a7 - - eor v12.16b, v12.16b, v0.16b - eor a12, a12, a0 - eor v13.16b, v13.16b, v1.16b - eor a13, a13, a1 - eor v14.16b, v14.16b, v2.16b - eor a14, a14, a2 - eor v15.16b, v15.16b, v3.16b - eor a15, a15, a3 - - rev32 v12.8h, v12.8h - ror a12, a12, #16 - rev32 v13.8h, v13.8h - ror a13, a13, #16 - rev32 v14.8h, v14.8h - ror a14, a14, #16 - rev32 v15.8h, v15.8h - ror a15, a15, #16 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - add v8.4s, v8.4s, v12.4s - add a8, a8, a12 - add v9.4s, v9.4s, v13.4s - add a9, a9, a13 - add v10.4s, v10.4s, v14.4s - add a10, a10, a14 - add v11.4s, v11.4s, v15.4s - add a11, a11, a15 - - eor v16.16b, v4.16b, v8.16b - eor a4, a4, a8 - eor v17.16b, v5.16b, v9.16b - eor a5, a5, a9 - eor v18.16b, v6.16b, v10.16b - eor a6, a6, a10 - eor v19.16b, v7.16b, v11.16b - eor a7, a7, a11 - - shl v4.4s, v16.4s, #12 - shl v5.4s, v17.4s, #12 - shl v6.4s, v18.4s, #12 - shl v7.4s, v19.4s, #12 - - sri v4.4s, v16.4s, #20 - ror a4, a4, #20 - sri v5.4s, v17.4s, #20 - ror a5, a5, #20 - sri v6.4s, v18.4s, #20 - ror a6, a6, #20 - sri v7.4s, v19.4s, #20 - ror a7, a7, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - add v0.4s, v0.4s, v4.4s - 
add a0, a0, a4 - add v1.4s, v1.4s, v5.4s - add a1, a1, a5 - add v2.4s, v2.4s, v6.4s - add a2, a2, a6 - add v3.4s, v3.4s, v7.4s - add a3, a3, a7 - - eor v12.16b, v12.16b, v0.16b - eor a12, a12, a0 - eor v13.16b, v13.16b, v1.16b - eor a13, a13, a1 - eor v14.16b, v14.16b, v2.16b - eor a14, a14, a2 - eor v15.16b, v15.16b, v3.16b - eor a15, a15, a3 - - tbl v12.16b, {v12.16b}, v31.16b - ror a12, a12, #24 - tbl v13.16b, {v13.16b}, v31.16b - ror a13, a13, #24 - tbl v14.16b, {v14.16b}, v31.16b - ror a14, a14, #24 - tbl v15.16b, {v15.16b}, v31.16b - ror a15, a15, #24 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - add v8.4s, v8.4s, v12.4s - add a8, a8, a12 - add v9.4s, v9.4s, v13.4s - add a9, a9, a13 - add v10.4s, v10.4s, v14.4s - add a10, a10, a14 - add v11.4s, v11.4s, v15.4s - add a11, a11, a15 - - eor v16.16b, v4.16b, v8.16b - eor a4, a4, a8 - eor v17.16b, v5.16b, v9.16b - eor a5, a5, a9 - eor v18.16b, v6.16b, v10.16b - eor a6, a6, a10 - eor v19.16b, v7.16b, v11.16b - eor a7, a7, a11 - - shl v4.4s, v16.4s, #7 - shl v5.4s, v17.4s, #7 - shl v6.4s, v18.4s, #7 - shl v7.4s, v19.4s, #7 - - sri v4.4s, v16.4s, #25 - ror a4, a4, #25 - sri v5.4s, v17.4s, #25 - ror a5, a5, #25 - sri v6.4s, v18.4s, #25 - ror a6, a6, #25 - sri v7.4s, v19.4s, #25 - ror a7, a7, #25 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - add v0.4s, v0.4s, v5.4s - add a0, a0, a5 - add v1.4s, v1.4s, v6.4s - add a1, a1, a6 - add v2.4s, v2.4s, v7.4s - add a2, a2, a7 - add v3.4s, v3.4s, v4.4s - add a3, a3, a4 - - eor v15.16b, v15.16b, v0.16b - eor a15, a15, a0 - eor v12.16b, v12.16b, v1.16b - eor a12, a12, a1 - eor v13.16b, v13.16b, v2.16b - eor a13, a13, a2 - eor v14.16b, v14.16b, v3.16b - eor a14, a14, a3 - - rev32 v15.8h, v15.8h - ror a15, a15, #16 - rev32 v12.8h, v12.8h - ror a12, a12, #16 - rev32 v13.8h, v13.8h - ror a13, a13, #16 - rev32 v14.8h, v14.8h - ror a14, a14, #16 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - add v10.4s, v10.4s, v15.4s - add a10, a10, a15 - add v11.4s, v11.4s, v12.4s - add a11, a11, a12 - add v8.4s, v8.4s, v13.4s - add a8, a8, a13 - add v9.4s, v9.4s, v14.4s - add a9, a9, a14 - - eor v16.16b, v5.16b, v10.16b - eor a5, a5, a10 - eor v17.16b, v6.16b, v11.16b - eor a6, a6, a11 - eor v18.16b, v7.16b, v8.16b - eor a7, a7, a8 - eor v19.16b, v4.16b, v9.16b - eor a4, a4, a9 - - shl v5.4s, v16.4s, #12 - shl v6.4s, v17.4s, #12 - shl v7.4s, v18.4s, #12 - shl v4.4s, v19.4s, #12 - - sri v5.4s, v16.4s, #20 - ror a5, a5, #20 - sri v6.4s, v17.4s, #20 - ror a6, a6, #20 - sri v7.4s, v18.4s, #20 - ror a7, a7, #20 - sri v4.4s, v19.4s, #20 - ror a4, a4, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - add v0.4s, v0.4s, v5.4s - add a0, a0, a5 - add v1.4s, v1.4s, v6.4s - add a1, a1, a6 - add v2.4s, v2.4s, v7.4s - add a2, a2, a7 - add v3.4s, v3.4s, v4.4s - add a3, a3, a4 - - eor v15.16b, v15.16b, v0.16b - eor a15, a15, a0 - eor v12.16b, v12.16b, v1.16b - eor a12, a12, a1 - eor v13.16b, v13.16b, v2.16b - eor a13, a13, a2 - eor v14.16b, v14.16b, v3.16b - eor a14, a14, a3 - - tbl v15.16b, {v15.16b}, v31.16b - ror a15, a15, #24 - tbl v12.16b, {v12.16b}, 
v31.16b - ror a12, a12, #24 - tbl v13.16b, {v13.16b}, v31.16b - ror a13, a13, #24 - tbl v14.16b, {v14.16b}, v31.16b - ror a14, a14, #24 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - add v10.4s, v10.4s, v15.4s - add a10, a10, a15 - add v11.4s, v11.4s, v12.4s - add a11, a11, a12 - add v8.4s, v8.4s, v13.4s - add a8, a8, a13 - add v9.4s, v9.4s, v14.4s - add a9, a9, a14 - - eor v16.16b, v5.16b, v10.16b - eor a5, a5, a10 - eor v17.16b, v6.16b, v11.16b - eor a6, a6, a11 - eor v18.16b, v7.16b, v8.16b - eor a7, a7, a8 - eor v19.16b, v4.16b, v9.16b - eor a4, a4, a9 - - shl v5.4s, v16.4s, #7 - shl v6.4s, v17.4s, #7 - shl v7.4s, v18.4s, #7 - shl v4.4s, v19.4s, #7 - - sri v5.4s, v16.4s, #25 - ror a5, a5, #25 - sri v6.4s, v17.4s, #25 - ror a6, a6, #25 - sri v7.4s, v18.4s, #25 - ror a7, a7, #25 - sri v4.4s, v19.4s, #25 - ror a4, a4, #25 - - subs w3, w3, #2 - b.ne .Ldoubleround4 - - ld4r {v16.4s-v19.4s}, [x0], #16 - ld4r {v20.4s-v23.4s}, [x0], #16 - - // x12 += counter values 0-3 - add v12.4s, v12.4s, v30.4s - - // x0[0-3] += s0[0] - // x1[0-3] += s0[1] - // x2[0-3] += s0[2] - // x3[0-3] += s0[3] - add v0.4s, v0.4s, v16.4s - mov w6, v16.s[0] - mov w7, v17.s[0] - add v1.4s, v1.4s, v17.4s - mov w8, v18.s[0] - mov w9, v19.s[0] - add v2.4s, v2.4s, v18.4s - add a0, a0, w6 - add a1, a1, w7 - add v3.4s, v3.4s, v19.4s - add a2, a2, w8 - add a3, a3, w9 -CPU_BE( rev a0, a0 ) -CPU_BE( rev a1, a1 ) -CPU_BE( rev a2, a2 ) -CPU_BE( rev a3, a3 ) - - ld4r {v24.4s-v27.4s}, [x0], #16 - ld4r {v28.4s-v31.4s}, [x0] - - // x4[0-3] += s1[0] - // x5[0-3] += s1[1] - // x6[0-3] += s1[2] - // x7[0-3] += s1[3] - add v4.4s, v4.4s, v20.4s - mov w6, v20.s[0] - mov w7, v21.s[0] - add v5.4s, v5.4s, v21.4s - mov w8, v22.s[0] - mov w9, v23.s[0] - add v6.4s, v6.4s, v22.4s - add a4, a4, w6 - add a5, a5, w7 - add v7.4s, v7.4s, v23.4s - add a6, a6, w8 - add a7, a7, w9 -CPU_BE( rev a4, a4 ) -CPU_BE( rev a5, a5 ) -CPU_BE( rev a6, a6 ) -CPU_BE( rev a7, a7 ) - - // x8[0-3] += s2[0] - // x9[0-3] += s2[1] - // x10[0-3] += s2[2] - // x11[0-3] += s2[3] - add v8.4s, v8.4s, v24.4s - mov w6, v24.s[0] - mov w7, v25.s[0] - add v9.4s, v9.4s, v25.4s - mov w8, v26.s[0] - mov w9, v27.s[0] - add v10.4s, v10.4s, v26.4s - add a8, a8, w6 - add a9, a9, w7 - add v11.4s, v11.4s, v27.4s - add a10, a10, w8 - add a11, a11, w9 -CPU_BE( rev a8, a8 ) -CPU_BE( rev a9, a9 ) -CPU_BE( rev a10, a10 ) -CPU_BE( rev a11, a11 ) - - // x12[0-3] += s3[0] - // x13[0-3] += s3[1] - // x14[0-3] += s3[2] - // x15[0-3] += s3[3] - add v12.4s, v12.4s, v28.4s - mov w6, v28.s[0] - mov w7, v29.s[0] - add v13.4s, v13.4s, v29.4s - mov w8, v30.s[0] - mov w9, v31.s[0] - add v14.4s, v14.4s, v30.4s - add a12, a12, w6 - add a13, a13, w7 - add v15.4s, v15.4s, v31.4s - add a14, a14, w8 - add a15, a15, w9 -CPU_BE( rev a12, a12 ) -CPU_BE( rev a13, a13 ) -CPU_BE( rev a14, a14 ) -CPU_BE( rev a15, a15 ) - - // interleave 32-bit words in state n, n+1 - ldp w6, w7, [x2], #64 - zip1 v16.4s, v0.4s, v1.4s - ldp w8, w9, [x2, #-56] - eor a0, a0, w6 - zip2 v17.4s, v0.4s, v1.4s - eor a1, a1, w7 - zip1 v18.4s, v2.4s, v3.4s - eor a2, a2, w8 - zip2 v19.4s, v2.4s, v3.4s - eor a3, a3, w9 - ldp w6, w7, [x2, #-48] - zip1 v20.4s, v4.4s, v5.4s - ldp w8, w9, [x2, #-40] - eor a4, a4, w6 - zip2 v21.4s, v4.4s, v5.4s - eor a5, a5, w7 - zip1 v22.4s, v6.4s, v7.4s - eor a6, a6, w8 - zip2 v23.4s, v6.4s, v7.4s - eor a7, a7, w9 - ldp w6, w7, [x2, #-32] - zip1 v24.4s, v8.4s, v9.4s - ldp w8, w9, [x2, #-24] - eor a8, a8, 
w6 - zip2 v25.4s, v8.4s, v9.4s - eor a9, a9, w7 - zip1 v26.4s, v10.4s, v11.4s - eor a10, a10, w8 - zip2 v27.4s, v10.4s, v11.4s - eor a11, a11, w9 - ldp w6, w7, [x2, #-16] - zip1 v28.4s, v12.4s, v13.4s - ldp w8, w9, [x2, #-8] - eor a12, a12, w6 - zip2 v29.4s, v12.4s, v13.4s - eor a13, a13, w7 - zip1 v30.4s, v14.4s, v15.4s - eor a14, a14, w8 - zip2 v31.4s, v14.4s, v15.4s - eor a15, a15, w9 - - add x3, x2, x4 - sub x3, x3, #128 // start of last block - - subs x5, x4, #128 - csel x2, x2, x3, ge - - // interleave 64-bit words in state n, n+2 - zip1 v0.2d, v16.2d, v18.2d - zip2 v4.2d, v16.2d, v18.2d - stp a0, a1, [x1], #64 - zip1 v8.2d, v17.2d, v19.2d - zip2 v12.2d, v17.2d, v19.2d - stp a2, a3, [x1, #-56] - - subs x6, x4, #192 - ld1 {v16.16b-v19.16b}, [x2], #64 - csel x2, x2, x3, ge - - zip1 v1.2d, v20.2d, v22.2d - zip2 v5.2d, v20.2d, v22.2d - stp a4, a5, [x1, #-48] - zip1 v9.2d, v21.2d, v23.2d - zip2 v13.2d, v21.2d, v23.2d - stp a6, a7, [x1, #-40] - - subs x7, x4, #256 - ld1 {v20.16b-v23.16b}, [x2], #64 - csel x2, x2, x3, ge - - zip1 v2.2d, v24.2d, v26.2d - zip2 v6.2d, v24.2d, v26.2d - stp a8, a9, [x1, #-32] - zip1 v10.2d, v25.2d, v27.2d - zip2 v14.2d, v25.2d, v27.2d - stp a10, a11, [x1, #-24] - - subs x8, x4, #320 - ld1 {v24.16b-v27.16b}, [x2], #64 - csel x2, x2, x3, ge - - zip1 v3.2d, v28.2d, v30.2d - zip2 v7.2d, v28.2d, v30.2d - stp a12, a13, [x1, #-16] - zip1 v11.2d, v29.2d, v31.2d - zip2 v15.2d, v29.2d, v31.2d - stp a14, a15, [x1, #-8] - - tbnz x5, #63, .Lt128 - ld1 {v28.16b-v31.16b}, [x2] - - // xor with corresponding input, write to output - eor v16.16b, v16.16b, v0.16b - eor v17.16b, v17.16b, v1.16b - eor v18.16b, v18.16b, v2.16b - eor v19.16b, v19.16b, v3.16b - - tbnz x6, #63, .Lt192 - - eor v20.16b, v20.16b, v4.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v6.16b - eor v23.16b, v23.16b, v7.16b - - st1 {v16.16b-v19.16b}, [x1], #64 - tbnz x7, #63, .Lt256 - - eor v24.16b, v24.16b, v8.16b - eor v25.16b, v25.16b, v9.16b - eor v26.16b, v26.16b, v10.16b - eor v27.16b, v27.16b, v11.16b - - st1 {v20.16b-v23.16b}, [x1], #64 - tbnz x8, #63, .Lt320 - - eor v28.16b, v28.16b, v12.16b - eor v29.16b, v29.16b, v13.16b - eor v30.16b, v30.16b, v14.16b - eor v31.16b, v31.16b, v15.16b - - st1 {v24.16b-v27.16b}, [x1], #64 - st1 {v28.16b-v31.16b}, [x1] - -.Lout: frame_pop - ret - - // fewer than 192 bytes of in/output -.Lt192: cbz x5, 1f // exactly 128 bytes? - ld1 {v28.16b-v31.16b}, [x10] - add x5, x5, x1 - tbl v28.16b, {v4.16b-v7.16b}, v28.16b - tbl v29.16b, {v4.16b-v7.16b}, v29.16b - tbl v30.16b, {v4.16b-v7.16b}, v30.16b - tbl v31.16b, {v4.16b-v7.16b}, v31.16b - -0: eor v20.16b, v20.16b, v28.16b - eor v21.16b, v21.16b, v29.16b - eor v22.16b, v22.16b, v30.16b - eor v23.16b, v23.16b, v31.16b - st1 {v20.16b-v23.16b}, [x5] // overlapping stores -1: st1 {v16.16b-v19.16b}, [x1] - b .Lout - - // fewer than 128 bytes of in/output -.Lt128: ld1 {v28.16b-v31.16b}, [x10] - add x5, x5, x1 - sub x1, x1, #64 - tbl v28.16b, {v0.16b-v3.16b}, v28.16b - tbl v29.16b, {v0.16b-v3.16b}, v29.16b - tbl v30.16b, {v0.16b-v3.16b}, v30.16b - tbl v31.16b, {v0.16b-v3.16b}, v31.16b - ld1 {v16.16b-v19.16b}, [x1] // reload first output block - b 0b - - // fewer than 256 bytes of in/output -.Lt256: cbz x6, 2f // exactly 192 bytes? 
- ld1 {v4.16b-v7.16b}, [x10] - add x6, x6, x1 - tbl v0.16b, {v8.16b-v11.16b}, v4.16b - tbl v1.16b, {v8.16b-v11.16b}, v5.16b - tbl v2.16b, {v8.16b-v11.16b}, v6.16b - tbl v3.16b, {v8.16b-v11.16b}, v7.16b - - eor v28.16b, v28.16b, v0.16b - eor v29.16b, v29.16b, v1.16b - eor v30.16b, v30.16b, v2.16b - eor v31.16b, v31.16b, v3.16b - st1 {v28.16b-v31.16b}, [x6] // overlapping stores -2: st1 {v20.16b-v23.16b}, [x1] - b .Lout - - // fewer than 320 bytes of in/output -.Lt320: cbz x7, 3f // exactly 256 bytes? - ld1 {v4.16b-v7.16b}, [x10] - add x7, x7, x1 - tbl v0.16b, {v12.16b-v15.16b}, v4.16b - tbl v1.16b, {v12.16b-v15.16b}, v5.16b - tbl v2.16b, {v12.16b-v15.16b}, v6.16b - tbl v3.16b, {v12.16b-v15.16b}, v7.16b - - eor v28.16b, v28.16b, v0.16b - eor v29.16b, v29.16b, v1.16b - eor v30.16b, v30.16b, v2.16b - eor v31.16b, v31.16b, v3.16b - st1 {v28.16b-v31.16b}, [x7] // overlapping stores -3: st1 {v24.16b-v27.16b}, [x1] - b .Lout -SYM_FUNC_END(chacha_4block_xor_neon) - - .section ".rodata", "a", %progbits - .align L1_CACHE_SHIFT -.Lpermute: - .set .Li, 0 - .rept 128 - .byte (.Li - 64) - .set .Li, .Li + 1 - .endr - -CTRINC: .word 1, 2, 3, 4 -ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/lib/crypto/chacha-neon-glue.c b/arch/arm64/lib/crypto/chacha-neon-glue.c deleted file mode 100644 index d0188f974ca5..000000000000 --- a/arch/arm64/lib/crypto/chacha-neon-glue.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * ChaCha and HChaCha functions (ARM64 optimized) - * - * Copyright (C) 2016 - 2017 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, - u8 *dst, const u8 *src, int nrounds); -asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, - u8 *dst, const u8 *src, - int nrounds, int bytes); -asmlinkage void hchacha_block_neon(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, - int bytes, int nrounds) -{ - while (bytes > 0) { - int l = min(bytes, CHACHA_BLOCK_SIZE * 5); - - if (l <= CHACHA_BLOCK_SIZE) { - u8 buf[CHACHA_BLOCK_SIZE]; - - memcpy(buf, src, l); - chacha_block_xor_neon(state, buf, buf, nrounds); - memcpy(dst, buf, l); - state->x[12] += 1; - break; - } - chacha_4block_xor_neon(state, dst, src, nrounds, l); - bytes -= l; - src += l; - dst += l; - state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); - } -} - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { - hchacha_block_generic(state, out, nrounds); - } else { - kernel_neon_begin(); - hchacha_block_neon(state, out, nrounds); - kernel_neon_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || - !crypto_simd_usable()) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_neon_begin(); - chacha_doneon(state, dst, src, todo, nrounds); - kernel_neon_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&have_neon); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -static int __init chacha_simd_mod_init(void) -{ - if (cpu_have_named_feature(ASIMD)) - static_branch_enable(&have_neon); - return 0; -} -subsys_initcall(chacha_simd_mod_init); - -static void __exit chacha_simd_mod_exit(void) -{ -} -module_exit(chacha_simd_mod_exit); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/lib/crypto/poly1305-armv8.pl b/arch/arm64/lib/crypto/poly1305-armv8.pl deleted file mode 100644 index 22c9069c0650..000000000000 --- a/arch/arm64/lib/crypto/poly1305-armv8.pl +++ /dev/null @@ -1,917 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -# project. -# ==================================================================== -# -# This module implements Poly1305 hash for ARMv8. -# -# June 2015 -# -# Numbers are cycles per processed byte with poly1305_blocks alone. -# -# IALU/gcc-4.9 NEON -# -# Apple A7 1.86/+5% 0.72 -# Cortex-A53 2.69/+58% 1.47 -# Cortex-A57 2.70/+7% 1.14 -# Denver 1.64/+50% 1.18(*) -# X-Gene 2.13/+68% 2.27 -# Mongoose 1.77/+75% 1.12 -# Kryo 2.70/+55% 1.13 -# ThunderX2 1.17/+95% 1.36 -# -# (*) estimate based on resources availability is less than 1.0, -# i.e. 
measured result is worse than expected, presumably binary -# translator is not almighty; - -$flavour=shift; -$output=shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); -my ($mac,$nonce)=($inp,$len); - -my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); - -$code.=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -.extern OPENSSL_armcap_P -#endif - -.text - -// forward "declarations" are required for Apple -.globl poly1305_blocks -.globl poly1305_emit - -.globl poly1305_init -.type poly1305_init,%function -.align 5 -poly1305_init: - cmp $inp,xzr - stp xzr,xzr,[$ctx] // zero hash value - stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] - - csel x0,xzr,x0,eq - b.eq .Lno_key - -#ifndef __KERNEL__ - adrp x17,OPENSSL_armcap_P - ldr w17,[x17,#:lo12:OPENSSL_armcap_P] -#endif - - ldp $r0,$r1,[$inp] // load key - mov $s1,#0xfffffffc0fffffff - movk $s1,#0x0fff,lsl#48 -#ifdef __AARCH64EB__ - rev $r0,$r0 // flip bytes - rev $r1,$r1 -#endif - and $r0,$r0,$s1 // &=0ffffffc0fffffff - and $s1,$s1,#-4 - and $r1,$r1,$s1 // &=0ffffffc0ffffffc - mov w#$s1,#-1 - stp $r0,$r1,[$ctx,#32] // save key value - str w#$s1,[$ctx,#48] // impossible key power value - -#ifndef __KERNEL__ - tst w17,#ARMV7_NEON - - adr $d0,.Lpoly1305_blocks - adr $r0,.Lpoly1305_blocks_neon - adr $d1,.Lpoly1305_emit - - csel $d0,$d0,$r0,eq - -# ifdef __ILP32__ - stp w#$d0,w#$d1,[$len] -# else - stp $d0,$d1,[$len] -# endif -#endif - mov x0,#1 -.Lno_key: - ret -.size poly1305_init,.-poly1305_init - -.type poly1305_blocks,%function -.align 5 -poly1305_blocks: -.Lpoly1305_blocks: - ands $len,$len,#-16 - b.eq .Lno_data - - ldp $h0,$h1,[$ctx] // load hash value - ldp $h2,x17,[$ctx,#16] // [along with is_base2_26] - ldp $r0,$r1,[$ctx,#32] // load key value - -#ifdef __AARCH64EB__ - lsr $d0,$h0,#32 - mov w#$d1,w#$h0 - lsr $d2,$h1,#32 - mov w15,w#$h1 - lsr x16,$h2,#32 -#else - mov w#$d0,w#$h0 - lsr $d1,$h0,#32 - mov w#$d2,w#$h1 - lsr x15,$h1,#32 - mov w16,w#$h2 -#endif - - add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 - lsr $d1,$d2,#12 - adds $d0,$d0,$d2,lsl#52 - add $d1,$d1,x15,lsl#14 - adc $d1,$d1,xzr - lsr $d2,x16,#24 - adds $d1,$d1,x16,lsl#40 - adc $d2,$d2,xzr - - cmp x17,#0 // is_base2_26? 
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - csel $h0,$h0,$d0,eq // choose between radixes - csel $h1,$h1,$d1,eq - csel $h2,$h2,$d2,eq - -.Loop: - ldp $t0,$t1,[$inp],#16 // load input - sub $len,$len,#16 -#ifdef __AARCH64EB__ - rev $t0,$t0 - rev $t1,$t1 -#endif - adds $h0,$h0,$t0 // accumulate input - adcs $h1,$h1,$t1 - - mul $d0,$h0,$r0 // h0*r0 - adc $h2,$h2,$padbit - umulh $d1,$h0,$r0 - - mul $t0,$h1,$s1 // h1*5*r1 - umulh $t1,$h1,$s1 - - adds $d0,$d0,$t0 - mul $t0,$h0,$r1 // h0*r1 - adc $d1,$d1,$t1 - umulh $d2,$h0,$r1 - - adds $d1,$d1,$t0 - mul $t0,$h1,$r0 // h1*r0 - adc $d2,$d2,xzr - umulh $t1,$h1,$r0 - - adds $d1,$d1,$t0 - mul $t0,$h2,$s1 // h2*5*r1 - adc $d2,$d2,$t1 - mul $t1,$h2,$r0 // h2*r0 - - adds $d1,$d1,$t0 - adc $d2,$d2,$t1 - - and $t0,$d2,#-4 // final reduction - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$d0,$t0 - adcs $h1,$d1,xzr - adc $h2,$h2,xzr - - cbnz $len,.Loop - - stp $h0,$h1,[$ctx] // store hash value - stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26] - -.Lno_data: - ret -.size poly1305_blocks,.-poly1305_blocks - -.type poly1305_emit,%function -.align 5 -poly1305_emit: -.Lpoly1305_emit: - ldp $h0,$h1,[$ctx] // load hash base 2^64 - ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26] - ldp $t0,$t1,[$nonce] // load nonce - -#ifdef __AARCH64EB__ - lsr $d0,$h0,#32 - mov w#$d1,w#$h0 - lsr $d2,$h1,#32 - mov w15,w#$h1 - lsr x16,$h2,#32 -#else - mov w#$d0,w#$h0 - lsr $d1,$h0,#32 - mov w#$d2,w#$h1 - lsr x15,$h1,#32 - mov w16,w#$h2 -#endif - - add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 - lsr $d1,$d2,#12 - adds $d0,$d0,$d2,lsl#52 - add $d1,$d1,x15,lsl#14 - adc $d1,$d1,xzr - lsr $d2,x16,#24 - adds $d1,$d1,x16,lsl#40 - adc $d2,$d2,xzr - - cmp $r0,#0 // is_base2_26? - csel $h0,$h0,$d0,eq // choose between radixes - csel $h1,$h1,$d1,eq - csel $h2,$h2,$d2,eq - - adds $d0,$h0,#5 // compare to modulus - adcs $d1,$h1,xzr - adc $d2,$h2,xzr - - tst $d2,#-4 // see if it's carried/borrowed - - csel $h0,$h0,$d0,eq - csel $h1,$h1,$d1,eq - -#ifdef __AARCH64EB__ - ror $t0,$t0,#32 // flip nonce words - ror $t1,$t1,#32 -#endif - adds $h0,$h0,$t0 // accumulate nonce - adc $h1,$h1,$t1 -#ifdef __AARCH64EB__ - rev $h0,$h0 // flip output bytes - rev $h1,$h1 -#endif - stp $h0,$h1,[$mac] // write result - - ret -.size poly1305_emit,.-poly1305_emit -___ -my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); -my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); -my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); -my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); -my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); -my ($T0,$T1,$MASK) = map("v$_",(29..31)); - -my ($in2,$zeros)=("x16","x17"); -my $is_base2_26 = $zeros; # borrow - -$code.=<<___; -.type poly1305_mult,%function -.align 5 -poly1305_mult: - mul $d0,$h0,$r0 // h0*r0 - umulh $d1,$h0,$r0 - - mul $t0,$h1,$s1 // h1*5*r1 - umulh $t1,$h1,$s1 - - adds $d0,$d0,$t0 - mul $t0,$h0,$r1 // h0*r1 - adc $d1,$d1,$t1 - umulh $d2,$h0,$r1 - - adds $d1,$d1,$t0 - mul $t0,$h1,$r0 // h1*r0 - adc $d2,$d2,xzr - umulh $t1,$h1,$r0 - - adds $d1,$d1,$t0 - mul $t0,$h2,$s1 // h2*5*r1 - adc $d2,$d2,$t1 - mul $t1,$h2,$r0 // h2*r0 - - adds $d1,$d1,$t0 - adc $d2,$d2,$t1 - - and $t0,$d2,#-4 // final reduction - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$d0,$t0 - adcs $h1,$d1,xzr - adc $h2,$h2,xzr - - ret -.size poly1305_mult,.-poly1305_mult - -.type poly1305_splat,%function -.align 4 -poly1305_splat: - and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x13,$h0,#26,#26 - extr x14,$h1,$h0,#52 
- and x14,x14,#0x03ffffff - ubfx x15,$h1,#14,#26 - extr x16,$h2,$h1,#40 - - str w12,[$ctx,#16*0] // r0 - add w12,w13,w13,lsl#2 // r1*5 - str w13,[$ctx,#16*1] // r1 - add w13,w14,w14,lsl#2 // r2*5 - str w12,[$ctx,#16*2] // s1 - str w14,[$ctx,#16*3] // r2 - add w14,w15,w15,lsl#2 // r3*5 - str w13,[$ctx,#16*4] // s2 - str w15,[$ctx,#16*5] // r3 - add w15,w16,w16,lsl#2 // r4*5 - str w14,[$ctx,#16*6] // s3 - str w16,[$ctx,#16*7] // r4 - str w15,[$ctx,#16*8] // s4 - - ret -.size poly1305_splat,.-poly1305_splat - -#ifdef __KERNEL__ -.globl poly1305_blocks_neon -#endif -.type poly1305_blocks_neon,%function -.align 5 -poly1305_blocks_neon: -.Lpoly1305_blocks_neon: - ldr $is_base2_26,[$ctx,#24] - cmp $len,#128 - b.lo .Lpoly1305_blocks - - .inst 0xd503233f // paciasp - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - cbz $is_base2_26,.Lbase2_64_neon - - ldp w10,w11,[$ctx] // load hash value base 2^26 - ldp w12,w13,[$ctx,#8] - ldr w14,[$ctx,#16] - - tst $len,#31 - b.eq .Leven_neon - - ldp $r0,$r1,[$ctx,#32] // load key value - - add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 - lsr $h1,x12,#12 - adds $h0,$h0,x12,lsl#52 - add $h1,$h1,x13,lsl#14 - adc $h1,$h1,xzr - lsr $h2,x14,#24 - adds $h1,$h1,x14,lsl#40 - adc $d2,$h2,xzr // can be partially reduced... - - ldp $d0,$d1,[$inp],#16 // load input - sub $len,$len,#16 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - -#ifdef __AARCH64EB__ - rev $d0,$d0 - rev $d1,$d1 -#endif - adds $h0,$h0,$d0 // accumulate input - adcs $h1,$h1,$d1 - adc $h2,$h2,$padbit - - bl poly1305_mult - - and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,$h0,#26,#26 - extr x12,$h1,$h0,#52 - and x12,x12,#0x03ffffff - ubfx x13,$h1,#14,#26 - extr x14,$h2,$h1,#40 - - b .Leven_neon - -.align 4 -.Lbase2_64_neon: - ldp $r0,$r1,[$ctx,#32] // load key value - - ldp $h0,$h1,[$ctx] // load hash value base 2^64 - ldr $h2,[$ctx,#16] - - tst $len,#31 - b.eq .Linit_neon - - ldp $d0,$d1,[$inp],#16 // load input - sub $len,$len,#16 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) -#ifdef __AARCH64EB__ - rev $d0,$d0 - rev $d1,$d1 -#endif - adds $h0,$h0,$d0 // accumulate input - adcs $h1,$h1,$d1 - adc $h2,$h2,$padbit - - bl poly1305_mult - -.Linit_neon: - ldr w17,[$ctx,#48] // first table element - and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,$h0,#26,#26 - extr x12,$h1,$h0,#52 - and x12,x12,#0x03ffffff - ubfx x13,$h1,#14,#26 - extr x14,$h2,$h1,#40 - - cmp w17,#-1 // is value impossible? 
- b.ne .Leven_neon - - fmov ${H0},x10 - fmov ${H1},x11 - fmov ${H2},x12 - fmov ${H3},x13 - fmov ${H4},x14 - - ////////////////////////////////// initialize r^n table - mov $h0,$r0 // r^1 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - mov $h1,$r1 - mov $h2,xzr - add $ctx,$ctx,#48+12 - bl poly1305_splat - - bl poly1305_mult // r^2 - sub $ctx,$ctx,#4 - bl poly1305_splat - - bl poly1305_mult // r^3 - sub $ctx,$ctx,#4 - bl poly1305_splat - - bl poly1305_mult // r^4 - sub $ctx,$ctx,#4 - bl poly1305_splat - sub $ctx,$ctx,#48 // restore original $ctx - b .Ldo_neon - -.align 4 -.Leven_neon: - fmov ${H0},x10 - fmov ${H1},x11 - fmov ${H2},x12 - fmov ${H3},x13 - fmov ${H4},x14 - -.Ldo_neon: - ldp x8,x12,[$inp,#32] // inp[2:3] - subs $len,$len,#64 - ldp x9,x13,[$inp,#48] - add $in2,$inp,#96 - adrp $zeros,.Lzeros - add $zeros,$zeros,#:lo12:.Lzeros - - lsl $padbit,$padbit,#24 - add x15,$ctx,#48 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov $IN23_0,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,$padbit,x12,lsr#40 - add x13,$padbit,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov $IN23_1,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - fmov $IN23_2,x8 - fmov $IN23_3,x10 - fmov $IN23_4,x12 - - ldp x8,x12,[$inp],#16 // inp[0:1] - ldp x9,x13,[$inp],#48 - - ld1 {$R0,$R1,$S1,$R2},[x15],#64 - ld1 {$S2,$R3,$S3,$R4},[x15],#64 - ld1 {$S4},[x15] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov $IN01_0,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,$padbit,x12,lsr#40 - add x13,$padbit,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov $IN01_1,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - movi $MASK.2d,#-1 - fmov $IN01_2,x8 - fmov $IN01_3,x10 - fmov $IN01_4,x12 - ushr $MASK.2d,$MASK.2d,#38 - - b.ls .Lskip_loop - -.align 4 -.Loop_neon: - //////////////////////////////////////////////////////////////// - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - // \___________________/ - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - // \___________________/ \____________________/ - // - // Note that we start with inp[2:3]*r^2. This is because it - // doesn't depend on reduction in previous iteration. 
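
(For orientation: the comment above describes evaluating the Poly1305 polynomial as independent Horner streams that each step by a fixed power of r, which is what lets the NEON lanes run in parallel. The toy program below, over a small made-up prime standing in for 2^130 - 5 and with made-up block values, only illustrates that the regrouped form equals the serial Horner form; the real code works modulo 2^130 - 5 with 26-bit limbs and four blocks per iteration.)

#include <stdio.h>
#include <stdint.h>

#define P 1000003ULL	/* toy modulus, illustration only */

int main(void)
{
	uint64_t m0 = 11, m1 = 22, m2 = 33, m3 = 44;	/* four "blocks" */
	uint64_t h = 7, r = 12345, r2 = r * r % P;

	/* Serial Horner form: h = (h + m) * r per block. */
	uint64_t serial = h;
	serial = (serial + m0) * r % P;
	serial = (serial + m1) * r % P;
	serial = (serial + m2) * r % P;
	serial = (serial + m3) * r % P;

	/* Two independent streams, each stepping by r^2, combined at the
	 * end with r^2 and r -- the regrouping the NEON code exploits. */
	uint64_t a = ((h + m0) * r2 % P + m2) % P;	/* blocks 0, 2 */
	uint64_t b = (m1 * r2 % P + m3) % P;		/* blocks 1, 3 */
	uint64_t parallel = (a * r2 % P + b * r % P) % P;

	printf("%llu == %llu\n", (unsigned long long)serial,
	       (unsigned long long)parallel);	/* identical values */
	return 0;
}
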
- //////////////////////////////////////////////////////////////// - // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 - // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 - // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 - // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 - // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 - - subs $len,$len,#64 - umull $ACC4,$IN23_0,${R4}[2] - csel $in2,$zeros,$in2,lo - umull $ACC3,$IN23_0,${R3}[2] - umull $ACC2,$IN23_0,${R2}[2] - ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) - umull $ACC1,$IN23_0,${R1}[2] - ldp x9,x13,[$in2],#48 - umull $ACC0,$IN23_0,${R0}[2] -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - umlal $ACC4,$IN23_1,${R3}[2] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal $ACC3,$IN23_1,${R2}[2] - and x5,x9,#0x03ffffff - umlal $ACC2,$IN23_1,${R1}[2] - ubfx x6,x8,#26,#26 - umlal $ACC1,$IN23_1,${R0}[2] - ubfx x7,x9,#26,#26 - umlal $ACC0,$IN23_1,${S4}[2] - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - - umlal $ACC4,$IN23_2,${R2}[2] - extr x8,x12,x8,#52 - umlal $ACC3,$IN23_2,${R1}[2] - extr x9,x13,x9,#52 - umlal $ACC2,$IN23_2,${R0}[2] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal $ACC1,$IN23_2,${S4}[2] - fmov $IN23_0,x4 - umlal $ACC0,$IN23_2,${S3}[2] - and x8,x8,#0x03ffffff - - umlal $ACC4,$IN23_3,${R1}[2] - and x9,x9,#0x03ffffff - umlal $ACC3,$IN23_3,${R0}[2] - ubfx x10,x12,#14,#26 - umlal $ACC2,$IN23_3,${S4}[2] - ubfx x11,x13,#14,#26 - umlal $ACC1,$IN23_3,${S3}[2] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal $ACC0,$IN23_3,${S2}[2] - fmov $IN23_1,x6 - - add $IN01_2,$IN01_2,$H2 - add x12,$padbit,x12,lsr#40 - umlal $ACC4,$IN23_4,${R0}[2] - add x13,$padbit,x13,lsr#40 - umlal $ACC3,$IN23_4,${S4}[2] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal $ACC2,$IN23_4,${S3}[2] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal $ACC1,$IN23_4,${S2}[2] - fmov $IN23_2,x8 - umlal $ACC0,$IN23_4,${S1}[2] - fmov $IN23_3,x10 - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4 and accumulate - - add $IN01_0,$IN01_0,$H0 - fmov $IN23_4,x12 - umlal $ACC3,$IN01_2,${R1}[0] - ldp x8,x12,[$inp],#16 // inp[0:1] - umlal $ACC0,$IN01_2,${S3}[0] - ldp x9,x13,[$inp],#48 - umlal $ACC4,$IN01_2,${R2}[0] - umlal $ACC1,$IN01_2,${S4}[0] - umlal $ACC2,$IN01_2,${R0}[0] -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - add $IN01_1,$IN01_1,$H1 - umlal $ACC3,$IN01_0,${R3}[0] - umlal $ACC4,$IN01_0,${R4}[0] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal $ACC2,$IN01_0,${R2}[0] - and x5,x9,#0x03ffffff - umlal $ACC0,$IN01_0,${R0}[0] - ubfx x6,x8,#26,#26 - umlal $ACC1,$IN01_0,${R1}[0] - ubfx x7,x9,#26,#26 - - add $IN01_3,$IN01_3,$H3 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - umlal $ACC3,$IN01_1,${R2}[0] - extr x8,x12,x8,#52 - umlal $ACC4,$IN01_1,${R3}[0] - extr x9,x13,x9,#52 - umlal $ACC0,$IN01_1,${S4}[0] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal $ACC2,$IN01_1,${R1}[0] - fmov $IN01_0,x4 - umlal $ACC1,$IN01_1,${R0}[0] - and x8,x8,#0x03ffffff - - add $IN01_4,$IN01_4,$H4 - and x9,x9,#0x03ffffff - umlal $ACC3,$IN01_3,${R0}[0] - ubfx x10,x12,#14,#26 - umlal $ACC0,$IN01_3,${S2}[0] - ubfx x11,x13,#14,#26 - umlal $ACC4,$IN01_3,${R1}[0] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal $ACC1,$IN01_3,${S3}[0] - fmov $IN01_1,x6 - umlal $ACC2,$IN01_3,${S4}[0] - add x12,$padbit,x12,lsr#40 - - umlal $ACC3,$IN01_4,${S4}[0] - add x13,$padbit,x13,lsr#40 - umlal $ACC0,$IN01_4,${S1}[0] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal 
$ACC4,$IN01_4,${R0}[0] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal $ACC1,$IN01_4,${S2}[0] - fmov $IN01_2,x8 - umlal $ACC2,$IN01_4,${S3}[0] - fmov $IN01_3,x10 - fmov $IN01_4,x12 - - ///////////////////////////////////////////////////////////////// - // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - // and P. Schwabe - // - // [see discussion in poly1305-armv4 module] - - ushr $T0.2d,$ACC3,#26 - xtn $H3,$ACC3 - ushr $T1.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - add $ACC4,$ACC4,$T0.2d // h3 -> h4 - bic $H3,#0xfc,lsl#24 // &=0x03ffffff - add $ACC1,$ACC1,$T1.2d // h0 -> h1 - - ushr $T0.2d,$ACC4,#26 - xtn $H4,$ACC4 - ushr $T1.2d,$ACC1,#26 - xtn $H1,$ACC1 - bic $H4,#0xfc,lsl#24 - add $ACC2,$ACC2,$T1.2d // h1 -> h2 - - add $ACC0,$ACC0,$T0.2d - shl $T0.2d,$T0.2d,#2 - shrn $T1.2s,$ACC2,#26 - xtn $H2,$ACC2 - add $ACC0,$ACC0,$T0.2d // h4 -> h0 - bic $H1,#0xfc,lsl#24 - add $H3,$H3,$T1.2s // h2 -> h3 - bic $H2,#0xfc,lsl#24 - - shrn $T0.2s,$ACC0,#26 - xtn $H0,$ACC0 - ushr $T1.2s,$H3,#26 - bic $H3,#0xfc,lsl#24 - bic $H0,#0xfc,lsl#24 - add $H1,$H1,$T0.2s // h0 -> h1 - add $H4,$H4,$T1.2s // h3 -> h4 - - b.hi .Loop_neon - -.Lskip_loop: - dup $IN23_2,${IN23_2}[0] - add $IN01_2,$IN01_2,$H2 - - //////////////////////////////////////////////////////////////// - // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - adds $len,$len,#32 - b.ne .Long_tail - - dup $IN23_2,${IN01_2}[0] - add $IN23_0,$IN01_0,$H0 - add $IN23_3,$IN01_3,$H3 - add $IN23_1,$IN01_1,$H1 - add $IN23_4,$IN01_4,$H4 - -.Long_tail: - dup $IN23_0,${IN23_0}[0] - umull2 $ACC0,$IN23_2,${S3} - umull2 $ACC3,$IN23_2,${R1} - umull2 $ACC4,$IN23_2,${R2} - umull2 $ACC2,$IN23_2,${R0} - umull2 $ACC1,$IN23_2,${S4} - - dup $IN23_1,${IN23_1}[0] - umlal2 $ACC0,$IN23_0,${R0} - umlal2 $ACC2,$IN23_0,${R2} - umlal2 $ACC3,$IN23_0,${R3} - umlal2 $ACC4,$IN23_0,${R4} - umlal2 $ACC1,$IN23_0,${R1} - - dup $IN23_3,${IN23_3}[0] - umlal2 $ACC0,$IN23_1,${S4} - umlal2 $ACC3,$IN23_1,${R2} - umlal2 $ACC2,$IN23_1,${R1} - umlal2 $ACC4,$IN23_1,${R3} - umlal2 $ACC1,$IN23_1,${R0} - - dup $IN23_4,${IN23_4}[0] - umlal2 $ACC3,$IN23_3,${R0} - umlal2 $ACC4,$IN23_3,${R1} - umlal2 $ACC0,$IN23_3,${S2} - umlal2 $ACC1,$IN23_3,${S3} - umlal2 $ACC2,$IN23_3,${S4} - - umlal2 $ACC3,$IN23_4,${S4} - umlal2 $ACC0,$IN23_4,${S1} - umlal2 $ACC4,$IN23_4,${R0} - umlal2 $ACC1,$IN23_4,${S2} - umlal2 $ACC2,$IN23_4,${S3} - - b.eq .Lshort_tail - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4:r^3 and accumulate - - add $IN01_0,$IN01_0,$H0 - umlal $ACC3,$IN01_2,${R1} - umlal $ACC0,$IN01_2,${S3} - umlal $ACC4,$IN01_2,${R2} - umlal $ACC1,$IN01_2,${S4} - umlal $ACC2,$IN01_2,${R0} - - add $IN01_1,$IN01_1,$H1 - umlal $ACC3,$IN01_0,${R3} - umlal $ACC0,$IN01_0,${R0} - umlal $ACC4,$IN01_0,${R4} - umlal $ACC1,$IN01_0,${R1} - umlal $ACC2,$IN01_0,${R2} - - add $IN01_3,$IN01_3,$H3 - umlal $ACC3,$IN01_1,${R2} - umlal $ACC0,$IN01_1,${S4} - umlal $ACC4,$IN01_1,${R3} - umlal $ACC1,$IN01_1,${R0} - umlal $ACC2,$IN01_1,${R1} - - add $IN01_4,$IN01_4,$H4 - umlal $ACC3,$IN01_3,${R0} - umlal $ACC0,$IN01_3,${S2} - umlal $ACC4,$IN01_3,${R1} - umlal $ACC1,$IN01_3,${S3} - umlal $ACC2,$IN01_3,${S4} - - umlal $ACC3,$IN01_4,${S4} - umlal $ACC0,$IN01_4,${S1} - umlal $ACC4,$IN01_4,${R0} - umlal $ACC1,$IN01_4,${S2} - umlal $ACC2,$IN01_4,${S3} - -.Lshort_tail: - //////////////////////////////////////////////////////////////// - // horizontal add - - addp $ACC3,$ACC3,$ACC3 - ldp d8,d9,[sp,#16] // meet ABI requirements - addp $ACC0,$ACC0,$ACC0 - ldp d10,d11,[sp,#32] - addp 
$ACC4,$ACC4,$ACC4 - ldp d12,d13,[sp,#48] - addp $ACC1,$ACC1,$ACC1 - ldp d14,d15,[sp,#64] - addp $ACC2,$ACC2,$ACC2 - ldr x30,[sp,#8] - - //////////////////////////////////////////////////////////////// - // lazy reduction, but without narrowing - - ushr $T0.2d,$ACC3,#26 - and $ACC3,$ACC3,$MASK.2d - ushr $T1.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - - add $ACC4,$ACC4,$T0.2d // h3 -> h4 - add $ACC1,$ACC1,$T1.2d // h0 -> h1 - - ushr $T0.2d,$ACC4,#26 - and $ACC4,$ACC4,$MASK.2d - ushr $T1.2d,$ACC1,#26 - and $ACC1,$ACC1,$MASK.2d - add $ACC2,$ACC2,$T1.2d // h1 -> h2 - - add $ACC0,$ACC0,$T0.2d - shl $T0.2d,$T0.2d,#2 - ushr $T1.2d,$ACC2,#26 - and $ACC2,$ACC2,$MASK.2d - add $ACC0,$ACC0,$T0.2d // h4 -> h0 - add $ACC3,$ACC3,$T1.2d // h2 -> h3 - - ushr $T0.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - ushr $T1.2d,$ACC3,#26 - and $ACC3,$ACC3,$MASK.2d - add $ACC1,$ACC1,$T0.2d // h0 -> h1 - add $ACC4,$ACC4,$T1.2d // h3 -> h4 - - //////////////////////////////////////////////////////////////// - // write the result, can be partially reduced - - st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 - mov x4,#1 - st1 {$ACC4}[0],[$ctx] - str x4,[$ctx,#8] // set is_base2_26 - - ldr x29,[sp],#80 - .inst 0xd50323bf // autiasp - ret -.size poly1305_blocks_neon,.-poly1305_blocks_neon - -.pushsection .rodata -.align 5 -.Lzeros: -.long 0,0,0,0,0,0,0,0 -.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" -.popsection - -.align 2 -#if !defined(__KERNEL__) && !defined(_WIN64) -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif -___ - -foreach (split("\n",$code)) { - s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or - s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or - (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or - (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or - (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or - (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or - (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); - - s/\.[124]([sd])\[/.$1\[/; - s/w#x([0-9]+)/w$1/g; - - print $_,"\n"; -} -close STDOUT; diff --git a/arch/arm64/lib/crypto/poly1305-glue.c b/arch/arm64/lib/crypto/poly1305-glue.c deleted file mode 100644 index c9a74766785b..000000000000 --- a/arch/arm64/lib/crypto/poly1305-glue.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 - * - * Copyright (C) 2019 Linaro Ltd. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) -{ - len = round_down(len, POLY1305_BLOCK_SIZE); - if (static_branch_likely(&have_neon)) { - do { - unsigned int todo = min_t(unsigned int, len, SZ_4K); - - kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, padbit); - kernel_neon_end(); - - len -= todo; - src += todo; - } while (len); - } else - poly1305_blocks(state, src, len, padbit); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -bool poly1305_is_arch_optimized(void) -{ - /* We always can use at least the ARM64 scalar implementation. */ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -static int __init neon_poly1305_mod_init(void) -{ - if (cpu_have_named_feature(ASIMD)) - static_branch_enable(&have_neon); - return 0; -} -subsys_initcall(neon_poly1305_mod_init); - -static void __exit neon_poly1305_mod_exit(void) -{ -} -module_exit(neon_poly1305_mod_exit); - -MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/lib/crypto/sha2-armv8.pl b/arch/arm64/lib/crypto/sha2-armv8.pl deleted file mode 100644 index 4aebd20c498b..000000000000 --- a/arch/arm64/lib/crypto/sha2-armv8.pl +++ /dev/null @@ -1,786 +0,0 @@ -#! /usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -# This code is taken from the OpenSSL project but the author (Andy Polyakov) -# has relicensed it under the GPLv2. Therefore this program is free software; -# you can redistribute it and/or modify it under the terms of the GNU General -# Public License version 2 as published by the Free Software Foundation. -# -# The original headers, including the original license headers, are -# included below for completeness. - -# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. -# -# Licensed under the OpenSSL license (the "License"). You may not use -# this file except in compliance with the License. You can obtain a copy -# in the file LICENSE in the source distribution or at -# https://www.openssl.org/source/license.html - -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# SHA256/512 for ARMv8. 
-# -# Performance in cycles per processed byte and improvement coefficient -# over code generated with "default" compiler: -# -# SHA256-hw SHA256(*) SHA512 -# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) -# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) -# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) -# Denver 2.01 10.5 (+26%) 6.70 (+8%) -# X-Gene 20.0 (+100%) 12.8 (+300%(***)) -# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) -# -# (*) Software SHA256 results are of lesser relevance, presented -# mostly for informational purposes. -# (**) The result is a trade-off: it's possible to improve it by -# 10% (or by 1 cycle per round), but at the cost of 20% loss -# on Cortex-A53 (or by 4 cycles per round). -# (***) Super-impressive coefficients over gcc-generated code are -# indication of some compiler "pathology", most notably code -# generated with -mgeneral-regs-only is significantly faster -# and the gap is only 40-90%. -# -# October 2016. -# -# Originally it was reckoned that it makes no sense to implement NEON -# version of SHA256 for 64-bit processors. This is because performance -# improvement on most wide-spread Cortex-A5x processors was observed -# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was -# observed that 32-bit NEON SHA256 performs significantly better than -# 64-bit scalar version on *some* of the more recent processors. As -# result 64-bit NEON version of SHA256 was added to provide best -# all-round performance. For example it executes ~30% faster on X-Gene -# and Mongoose. [For reference, NEON version of SHA512 is bound to -# deliver much less improvement, likely *negative* on Cortex-A5x. -# Which is why NEON support is limited to SHA256.] - -$output=pop; -$flavour=pop; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open OUT,"| \"$^X\" $xlate $flavour $output"; - *STDOUT=*OUT; -} else { - open STDOUT,">$output"; -} - -if ($output =~ /512/) { - $BITS=512; - $SZ=8; - @Sigma0=(28,34,39); - @Sigma1=(14,18,41); - @sigma0=(1, 8, 7); - @sigma1=(19,61, 6); - $rounds=80; - $reg_t="x"; -} else { - $BITS=256; - $SZ=4; - @Sigma0=( 2,13,22); - @Sigma1=( 6,11,25); - @sigma0=( 7,18, 3); - @sigma1=(17,19,10); - $rounds=64; - $reg_t="w"; -} - -$func="sha${BITS}_blocks_arch"; - -($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); - -@X=map("$reg_t$_",(3..15,0..2)); -@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); -($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); - -sub BODY_00_xx { -my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; -my $j=($i+1)&15; -my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); - $T0=@X[$i+3] if ($i<11); - -$code.=<<___ if ($i<16); -#ifndef __AARCH64EB__ - rev @X[$i],@X[$i] // $i -#endif -___ -$code.=<<___ if ($i<13 && ($i&1)); - ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ -___ -$code.=<<___ if ($i==13); - ldp @X[14],@X[15],[$inp] -___ -$code.=<<___ if ($i>=14); - ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] -___ -$code.=<<___ if ($i>0 && $i<16); - add $a,$a,$t1 // h+=Sigma0(a) -___ -$code.=<<___ if ($i>=11); - str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] -___ -# While ARMv8 specifies merged rotate-n-logical operation such as -# 'eor x,y,z,ror#n', it was found to negatively affect performance -# on Apple A7. The reason seems to be that it requires even 'y' to -# be available earlier. This means that such merged instruction is -# not necessarily best choice on critical path... 
On the other hand -# Cortex-A5x handles merged instructions much better than disjoint -# rotate and logical... See (**) footnote above. -$code.=<<___ if ($i<15); - ror $t0,$e,#$Sigma1[0] - add $h,$h,$t2 // h+=K[i] - eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` - and $t1,$f,$e - bic $t2,$g,$e - add $h,$h,@X[$i&15] // h+=X[i] - orr $t1,$t1,$t2 // Ch(e,f,g) - eor $t2,$a,$b // a^b, b^c in next round - eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) - ror $T0,$a,#$Sigma0[0] - add $h,$h,$t1 // h+=Ch(e,f,g) - eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` - add $h,$h,$t0 // h+=Sigma1(e) - and $t3,$t3,$t2 // (b^c)&=(a^b) - add $d,$d,$h // d+=h - eor $t3,$t3,$b // Maj(a,b,c) - eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) - add $h,$h,$t3 // h+=Maj(a,b,c) - ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round - //add $h,$h,$t1 // h+=Sigma0(a) -___ -$code.=<<___ if ($i>=15); - ror $t0,$e,#$Sigma1[0] - add $h,$h,$t2 // h+=K[i] - ror $T1,@X[($j+1)&15],#$sigma0[0] - and $t1,$f,$e - ror $T2,@X[($j+14)&15],#$sigma1[0] - bic $t2,$g,$e - ror $T0,$a,#$Sigma0[0] - add $h,$h,@X[$i&15] // h+=X[i] - eor $t0,$t0,$e,ror#$Sigma1[1] - eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] - orr $t1,$t1,$t2 // Ch(e,f,g) - eor $t2,$a,$b // a^b, b^c in next round - eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) - eor $T0,$T0,$a,ror#$Sigma0[1] - add $h,$h,$t1 // h+=Ch(e,f,g) - and $t3,$t3,$t2 // (b^c)&=(a^b) - eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] - eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) - add $h,$h,$t0 // h+=Sigma1(e) - eor $t3,$t3,$b // Maj(a,b,c) - eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) - eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) - add @X[$j],@X[$j],@X[($j+9)&15] - add $d,$d,$h // d+=h - add $h,$h,$t3 // h+=Maj(a,b,c) - ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round - add @X[$j],@X[$j],$T1 - add $h,$h,$t1 // h+=Sigma0(a) - add @X[$j],@X[$j],$T2 -___ - ($t2,$t3)=($t3,$t2); -} - -$code.=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -#endif - -.text - -.extern OPENSSL_armcap_P -.globl $func -.type $func,%function -.align 6 -$func: -___ -$code.=<<___ if ($SZ==4); -#ifndef __KERNEL__ -# ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -# else - ldr x16,.LOPENSSL_armcap_P -# endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] - tst w16,#ARMV8_SHA256 - b.ne .Lv8_entry - tst w16,#ARMV7_NEON - b.ne .Lneon_entry -#endif -___ -$code.=<<___; - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#4*$SZ - - ldp $A,$B,[$ctx] // load context - ldp $C,$D,[$ctx,#2*$SZ] - ldp $E,$F,[$ctx,#4*$SZ] - add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input - ldp $G,$H,[$ctx,#6*$SZ] - adr $Ktbl,.LK$BITS - stp $ctx,$num,[x29,#96] - -.Loop: - ldp @X[0],@X[1],[$inp],#2*$SZ - ldr $t2,[$Ktbl],#$SZ // *K++ - eor $t3,$B,$C // magic seed - str $inp,[x29,#112] -___ -for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } -$code.=".Loop_16_xx:\n"; -for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } -$code.=<<___; - cbnz $t2,.Loop_16_xx - - ldp $ctx,$num,[x29,#96] - ldr $inp,[x29,#112] - sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind - - ldp @X[0],@X[1],[$ctx] - ldp @X[2],@X[3],[$ctx,#2*$SZ] - add $inp,$inp,#14*$SZ // advance input pointer - ldp @X[4],@X[5],[$ctx,#4*$SZ] - add $A,$A,@X[0] - ldp @X[6],@X[7],[$ctx,#6*$SZ] - add $B,$B,@X[1] - add $C,$C,@X[2] - add $D,$D,@X[3] - stp $A,$B,[$ctx] - add $E,$E,@X[4] - add $F,$F,@X[5] - stp $C,$D,[$ctx,#2*$SZ] - add $G,$G,@X[6] - add $H,$H,@X[7] - cmp $inp,$num - stp $E,$F,[$ctx,#4*$SZ] - stp $G,$H,[$ctx,#6*$SZ] - b.ne .Loop - - ldp x19,x20,[x29,#16] - add sp,sp,#4*$SZ - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 - ret -.size $func,.-$func - -.align 6 -.type .LK$BITS,%object -.LK$BITS: -___ -$code.=<<___ if ($SZ==8); - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 - .quad 0 // terminator -___ -$code.=<<___ if ($SZ==4); - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0 //terminator -___ -$code.=<<___; -.size .LK$BITS,.-.LK$BITS -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ - .long OPENSSL_armcap_P-. -# else - .quad OPENSSL_armcap_P-. -# endif -#endif -.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by " -.align 2 -___ - -if ($SZ==4) { -my $Ktbl="x3"; - -my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); -my @MSG=map("v$_.16b",(4..7)); -my ($W0,$W1)=("v16.4s","v17.4s"); -my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); - -$code.=<<___; -#ifndef __KERNEL__ -.type sha256_block_armv8,%function -.align 6 -sha256_block_armv8: -.Lv8_entry: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1.32 {$ABCD,$EFGH},[$ctx] - adr $Ktbl,.LK256 - -.Loop_hw: - ld1 {@MSG[0]-@MSG[3]},[$inp],#64 - sub $num,$num,#1 - ld1.32 {$W0},[$Ktbl],#16 - rev32 @MSG[0],@MSG[0] - rev32 @MSG[1],@MSG[1] - rev32 @MSG[2],@MSG[2] - rev32 @MSG[3],@MSG[3] - orr $ABCD_SAVE,$ABCD,$ABCD // offload - orr $EFGH_SAVE,$EFGH,$EFGH -___ -for($i=0;$i<12;$i++) { -$code.=<<___; - ld1.32 {$W1},[$Ktbl],#16 - add.i32 $W0,$W0,@MSG[0] - sha256su0 @MSG[0],@MSG[1] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - sha256su1 @MSG[0],@MSG[2],@MSG[3] -___ - ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); -} -$code.=<<___; - ld1.32 {$W1},[$Ktbl],#16 - add.i32 $W0,$W0,@MSG[0] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - ld1.32 {$W0},[$Ktbl],#16 - add.i32 $W1,$W1,@MSG[1] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - ld1.32 {$W1},[$Ktbl] - add.i32 $W0,$W0,@MSG[2] - sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - add.i32 $W1,$W1,@MSG[3] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - add.i32 $ABCD,$ABCD,$ABCD_SAVE - add.i32 $EFGH,$EFGH,$EFGH_SAVE - - cbnz $num,.Loop_hw - - st1.32 {$ABCD,$EFGH},[$ctx] - - ldr x29,[sp],#16 - ret -.size sha256_block_armv8,.-sha256_block_armv8 -#endif -___ -} - -if ($SZ==4) { ######################################### NEON stuff # -# You'll surely note a lot of similarities with sha256-armv4 module, -# and of course it's not a coincidence. sha256-armv4 was used as -# initial template, but was adapted for ARMv8 instruction set and -# extensively re-tuned for all-round performance. 
- -my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); -my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); -my $Ktbl="x16"; -my $Xfer="x17"; -my @X = map("q$_",(0..3)); -my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); -my $j=0; - -sub AUTOLOAD() # thunk [simplified] x86-style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; - my $arg = pop; - $arg = "#$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; -} - -sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } -sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } -sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } - -sub Xupdate() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T2,$T0,$sigma0[0]); - eval(shift(@insns)); - &ushr_32 ($T1,$T0,$sigma0[2]); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] - eval(shift(@insns)); - &sli_32 ($T2,$T0,32-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T3,$T0,$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T1,$T1,$T2); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T3,$T0,32-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T4,$T7,$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T4,$T7,32-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T5,$T7,$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T3,$T7,$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &sli_u32 ($T3,$T7,32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T4); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T6,@X[0],$sigma1[0]); - eval(shift(@insns)); - &ushr_32 ($T7,@X[0],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T6,@X[0],32-$sigma1[0]); - eval(shift(@insns)); - &ushr_32 ($T5,@X[0],$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T7,$T7,$T6); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T5,@X[0],32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_32 ("{$T0}","[$Ktbl], #16"); - eval(shift(@insns)); - &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T5); - eval(shift(@insns)); - eval(shift(@insns)); - &mov (&Dhi($T5), &Dlo($T7)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 ($T0,$T0,@X[0]); - while($#insns>=1) { eval(shift(@insns)); } - &st1_32 ("{$T0}","[$Xfer], #16"); - eval(shift(@insns)); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub 
Xpreload() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_8 ("{@X[0]}","[$inp],#16"); - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_32 ("{$T0}","[$Ktbl],#16"); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &rev32 (@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 ($T0,$T0,@X[0]); - foreach (@insns) { eval; } # remaining instructions - &st1_32 ("{$T0}","[$Xfer], #16"); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub body_00_15 () { - ( - '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. - '&add ($h,$h,$t1)', # h+=X[i]+K[i] - '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past - '&and ($t1,$f,$e)', - '&bic ($t4,$g,$e)', - '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', - '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past - '&orr ($t1,$t1,$t4)', # Ch(e,f,g) - '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) - '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', - '&add ($h,$h,$t1)', # h+=Ch(e,f,g) - '&ror ($t0,$t0,"#$Sigma1[0]")', - '&eor ($t2,$a,$b)', # a^b, b^c in next round - '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) - '&add ($h,$h,$t0)', # h+=Sigma1(e) - '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. - '&ldr ($t1,"[$Ktbl]") if ($j==15);'. - '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) - '&ror ($t4,$t4,"#$Sigma0[0]")', - '&add ($d,$d,$h)', # d+=h - '&eor ($t3,$t3,$b)', # Maj(a,b,c) - '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' - ) -} - -$code.=<<___; -#ifdef __KERNEL__ -.globl sha256_block_neon -#endif -.type sha256_block_neon,%function -.align 4 -sha256_block_neon: -.Lneon_entry: - stp x29, x30, [sp, #-16]! 
- mov x29, sp - sub sp,sp,#16*4 - - adr $Ktbl,.LK256 - add $num,$inp,$num,lsl#6 // len to point at the end of inp - - ld1.8 {@X[0]},[$inp], #16 - ld1.8 {@X[1]},[$inp], #16 - ld1.8 {@X[2]},[$inp], #16 - ld1.8 {@X[3]},[$inp], #16 - ld1.32 {$T0},[$Ktbl], #16 - ld1.32 {$T1},[$Ktbl], #16 - ld1.32 {$T2},[$Ktbl], #16 - ld1.32 {$T3},[$Ktbl], #16 - rev32 @X[0],@X[0] // yes, even on - rev32 @X[1],@X[1] // big-endian - rev32 @X[2],@X[2] - rev32 @X[3],@X[3] - mov $Xfer,sp - add.32 $T0,$T0,@X[0] - add.32 $T1,$T1,@X[1] - add.32 $T2,$T2,@X[2] - st1.32 {$T0-$T1},[$Xfer], #32 - add.32 $T3,$T3,@X[3] - st1.32 {$T2-$T3},[$Xfer] - sub $Xfer,$Xfer,#32 - - ldp $A,$B,[$ctx] - ldp $C,$D,[$ctx,#8] - ldp $E,$F,[$ctx,#16] - ldp $G,$H,[$ctx,#24] - ldr $t1,[sp,#0] - mov $t2,wzr - eor $t3,$B,$C - mov $t4,wzr - b .L_00_48 - -.align 4 -.L_00_48: -___ - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); -$code.=<<___; - cmp $t1,#0 // check for K256 terminator - ldr $t1,[sp,#0] - sub $Xfer,$Xfer,#64 - bne .L_00_48 - - sub $Ktbl,$Ktbl,#256 // rewind $Ktbl - cmp $inp,$num - mov $Xfer, #64 - csel $Xfer, $Xfer, xzr, eq - sub $inp,$inp,$Xfer // avoid SEGV - mov $Xfer,sp -___ - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); -$code.=<<___; - add $A,$A,$t4 // h+=Sigma0(a) from the past - ldp $t0,$t1,[$ctx,#0] - add $A,$A,$t2 // h+=Maj(a,b,c) from the past - ldp $t2,$t3,[$ctx,#8] - add $A,$A,$t0 // accumulate - add $B,$B,$t1 - ldp $t0,$t1,[$ctx,#16] - add $C,$C,$t2 - add $D,$D,$t3 - ldp $t2,$t3,[$ctx,#24] - add $E,$E,$t0 - add $F,$F,$t1 - ldr $t1,[sp,#0] - stp $A,$B,[$ctx,#0] - add $G,$G,$t2 - mov $t2,wzr - stp $C,$D,[$ctx,#8] - add $H,$H,$t3 - stp $E,$F,[$ctx,#16] - eor $t3,$B,$C - stp $G,$H,[$ctx,#24] - mov $t4,wzr - mov $Xfer,sp - b.ne .L_00_48 - - ldr x29,[x29] - add sp,sp,#16*4+16 - ret -.size sha256_block_neon,.-sha256_block_neon -___ -} - -$code.=<<___; -#ifndef __KERNEL__ -.comm OPENSSL_armcap_P,4,4 -#endif -___ - -{ my %opcode = ( - "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, - "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); - - sub unsha256 { - my ($mnemonic,$arg)=@_; - - $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o - && - sprintf ".inst\t0x%08x\t//%s %s", - $opcode{$mnemonic}|$1|($2<<5)|($3<<16), - $mnemonic,$arg; - } -} - -open SELF,$0; -while() { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - -foreach(split("\n",$code)) { - - s/\`([^\`]*)\`/eval($1)/ge; - - s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; - - s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers - - s/\.[ui]?8(\s)/$1/; - s/\.\w?32\b// and s/\.16b/\.4s/g; - m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; - - print $_,"\n"; -} - -close STDOUT; diff --git a/arch/arm64/lib/crypto/sha256-ce.S b/arch/arm64/lib/crypto/sha256-ce.S deleted file mode 100644 index f3e21c6d87d2..000000000000 --- a/arch/arm64/lib/crypto/sha256-ce.S +++ /dev/null @@ -1,136 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions - * - * Copyright (C) 2014 Linaro Ltd - */ - -#include -#include - - .text - .arch armv8-a+crypto - - dga .req q20 - dgav .req v20 - dgb .req q21 - dgbv .req v21 - - t0 .req v22 - t1 .req v23 - - dg0q .req q24 - dg0v .req v24 - dg1q .req q25 - dg1v .req v25 - dg2q .req q26 - dg2v .req v26 - - .macro add_only, ev, rc, s0 - mov dg2v.16b, dg0v.16b - .ifeq \ev - add t1.4s, v\s0\().4s, \rc\().4s - sha256h dg0q, dg1q, 
t0.4s - sha256h2 dg1q, dg2q, t0.4s - .else - .ifnb \s0 - add t0.4s, v\s0\().4s, \rc\().4s - .endif - sha256h dg0q, dg1q, t1.4s - sha256h2 dg1q, dg2q, t1.4s - .endif - .endm - - .macro add_update, ev, rc, s0, s1, s2, s3 - sha256su0 v\s0\().4s, v\s1\().4s - add_only \ev, \rc, \s1 - sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s - .endm - - /* - * The SHA-256 round constants - */ - .section ".rodata", "a" - .align 4 -.Lsha2_rcon: - .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 - .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 - .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 - .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 - .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc - .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da - .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 - .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 - .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 - .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 - .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 - .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 - .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 - .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 - .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 - .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 - - /* - * size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS], - * const u8 *data, size_t nblocks); - */ - .text -SYM_FUNC_START(__sha256_ce_transform) - /* load round constants */ - adr_l x8, .Lsha2_rcon - ld1 { v0.4s- v3.4s}, [x8], #64 - ld1 { v4.4s- v7.4s}, [x8], #64 - ld1 { v8.4s-v11.4s}, [x8], #64 - ld1 {v12.4s-v15.4s}, [x8] - - /* load state */ - ld1 {dgav.4s, dgbv.4s}, [x0] - - /* load input */ -0: ld1 {v16.4s-v19.4s}, [x1], #64 - sub x2, x2, #1 - -CPU_LE( rev32 v16.16b, v16.16b ) -CPU_LE( rev32 v17.16b, v17.16b ) -CPU_LE( rev32 v18.16b, v18.16b ) -CPU_LE( rev32 v19.16b, v19.16b ) - - add t0.4s, v16.4s, v0.4s - mov dg0v.16b, dgav.16b - mov dg1v.16b, dgbv.16b - - add_update 0, v1, 16, 17, 18, 19 - add_update 1, v2, 17, 18, 19, 16 - add_update 0, v3, 18, 19, 16, 17 - add_update 1, v4, 19, 16, 17, 18 - - add_update 0, v5, 16, 17, 18, 19 - add_update 1, v6, 17, 18, 19, 16 - add_update 0, v7, 18, 19, 16, 17 - add_update 1, v8, 19, 16, 17, 18 - - add_update 0, v9, 16, 17, 18, 19 - add_update 1, v10, 17, 18, 19, 16 - add_update 0, v11, 18, 19, 16, 17 - add_update 1, v12, 19, 16, 17, 18 - - add_only 0, v13, 17 - add_only 1, v14, 18 - add_only 0, v15, 19 - add_only 1 - - /* update state */ - add dgav.4s, dgav.4s, dg0v.4s - add dgbv.4s, dgbv.4s, dg1v.4s - - /* return early if voluntary preemption is needed */ - cond_yield 1f, x5, x6 - - /* handled all input blocks? 
*/ - cbnz x2, 0b - - /* store new state */ -1: st1 {dgav.4s, dgbv.4s}, [x0] - mov x0, x2 - ret -SYM_FUNC_END(__sha256_ce_transform) diff --git a/arch/arm64/lib/crypto/sha256.c b/arch/arm64/lib/crypto/sha256.c deleted file mode 100644 index bcf7a3adc0c4..000000000000 --- a/arch/arm64/lib/crypto/sha256.c +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 optimized for ARM64 - * - * Copyright 2025 Google LLC - */ -#include -#include -#include -#include - -asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -EXPORT_SYMBOL_GPL(sha256_blocks_arch); -asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); - -void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon)) { - if (static_branch_likely(&have_ce)) { - do { - size_t rem; - - kernel_neon_begin(); - rem = __sha256_ce_transform(state, - data, nblocks); - kernel_neon_end(); - data += (nblocks - rem) * SHA256_BLOCK_SIZE; - nblocks = rem; - } while (nblocks); - } else { - kernel_neon_begin(); - sha256_block_neon(state, data, nblocks); - kernel_neon_end(); - } - } else { - sha256_blocks_arch(state, data, nblocks); - } -} -EXPORT_SYMBOL_GPL(sha256_blocks_simd); - -bool sha256_is_arch_optimized(void) -{ - /* We always can use at least the ARM64 scalar implementation. */ - return true; -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - -static int __init sha256_arm64_mod_init(void) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - cpu_have_named_feature(ASIMD)) { - static_branch_enable(&have_neon); - if (cpu_have_named_feature(SHA2)) - static_branch_enable(&have_ce); - } - return 0; -} -subsys_initcall(sha256_arm64_mod_init); - -static void __exit sha256_arm64_mod_exit(void) -{ -} -module_exit(sha256_arm64_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 optimized for ARM64"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index e14bef8e87af..fdeb91bf0032 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -193,7 +193,7 @@ if ARM source "lib/crypto/arm/Kconfig" endif if ARM64 -source "arch/arm64/lib/crypto/Kconfig" +source "lib/crypto/arm64/Kconfig" endif if MIPS source "arch/mips/lib/crypto/Kconfig" diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 5f2b81f82a85..19e9880c5d5f 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -87,7 +87,7 @@ endif ifeq ($(CONFIG_ARM64),y) libsha512-y += arm64/sha512-core.o -$(obj)/arm64/sha512-core.S: $(src)/../../arch/arm64/lib/crypto/sha2-armv8.pl +$(obj)/arm64/sha512-core.S: $(src)/arm64/sha2-armv8.pl $(call cmd,perlasm_with_args) clean-files += arm64/sha512-core.S libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o @@ -108,3 +108,4 @@ obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o libsm3-y := sm3.o obj-$(CONFIG_ARM) += arm/ +obj-$(CONFIG_ARM64) += arm64/ diff --git a/lib/crypto/arm64/.gitignore b/lib/crypto/arm64/.gitignore index 670a4d97b568..f6c4e8ef80da 100644 --- a/lib/crypto/arm64/.gitignore +++ b/lib/crypto/arm64/.gitignore @@ -1,2 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only +poly1305-core.S +sha256-core.S sha512-core.S diff --git a/lib/crypto/arm64/Kconfig 
b/lib/crypto/arm64/Kconfig new file mode 100644 index 000000000000..129a7685cb4c --- /dev/null +++ b/lib/crypto/arm64/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA20_NEON + tristate + depends on KERNEL_MODE_NEON + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_NEON + tristate + depends on KERNEL_MODE_NEON + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 + +config CRYPTO_SHA256_ARM64 + tristate + default CRYPTO_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile new file mode 100644 index 000000000000..946c09903711 --- /dev/null +++ b/lib/crypto/arm64/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o + +obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o +poly1305-neon-y := poly1305-core.o poly1305-glue.o +AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch +AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch + +obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o +sha256-arm64-y := sha256.o sha256-core.o +sha256-arm64-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) void $(@) + +$(obj)/%-core.S: $(src)/%-armv8.pl + $(call cmd,perlasm) + +$(obj)/sha256-core.S: $(src)/sha2-armv8.pl + $(call cmd,perlasm) + +clean-files += poly1305-core.S sha256-core.S diff --git a/lib/crypto/arm64/chacha-neon-core.S b/lib/crypto/arm64/chacha-neon-core.S new file mode 100644 index 000000000000..80079586ecc7 --- /dev/null +++ b/lib/crypto/arm64/chacha-neon-core.S @@ -0,0 +1,805 @@ +/* + * ChaCha/HChaCha NEON helper functions + * + * Copyright (C) 2016-2018 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Originally based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include + + .text + .align 6 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers v0-v3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in w3. 
+ * + * Clobbers: w3, x10, v4, v12 + */ +SYM_FUNC_START_LOCAL(chacha_permute) + + adr_l x10, ROT8 + ld1 {v12.4s}, [x10] + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + ext v1.16b, v1.16b, v1.16b, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + ext v3.16b, v3.16b, v3.16b, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + ext v1.16b, v1.16b, v1.16b, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + ext v3.16b, v3.16b, v3.16b, #4 + + subs w3, w3, #2 + b.ne .Ldoubleround + + ret +SYM_FUNC_END(chacha_permute) + +SYM_FUNC_START(chacha_block_xor_neon) + // x0: Input state matrix, s + // x1: 1 data block output, o + // x2: 1 data block input, i + // w3: nrounds + + stp x29, x30, [sp, #-16]! + mov x29, sp + + // x0..3 = s0..3 + ld1 {v0.4s-v3.4s}, [x0] + ld1 {v8.4s-v11.4s}, [x0] + + bl chacha_permute + + ld1 {v4.16b-v7.16b}, [x2] + + // o0 = i0 ^ (x0 + s0) + add v0.4s, v0.4s, v8.4s + eor v0.16b, v0.16b, v4.16b + + // o1 = i1 ^ (x1 + s1) + add v1.4s, v1.4s, v9.4s + eor v1.16b, v1.16b, v5.16b + + // o2 = i2 ^ (x2 + s2) + add v2.4s, v2.4s, v10.4s + eor v2.16b, v2.16b, v6.16b + + // o3 = i3 ^ (x3 + s3) + add v3.4s, v3.4s, v11.4s + eor v3.16b, v3.16b, v7.16b + + st1 {v0.16b-v3.16b}, [x1] + + ldp x29, x30, [sp], #16 + ret +SYM_FUNC_END(chacha_block_xor_neon) + +SYM_FUNC_START(hchacha_block_neon) + // x0: Input state matrix, s + // x1: output (8 32-bit words) + // w2: nrounds + + stp x29, x30, [sp, #-16]! + mov x29, sp + + ld1 {v0.4s-v3.4s}, [x0] + + mov w3, w2 + bl chacha_permute + + st1 {v0.4s}, [x1], #16 + st1 {v3.4s}, [x1] + + ldp x29, x30, [sp], #16 + ret +SYM_FUNC_END(hchacha_block_neon) + + a0 .req w12 + a1 .req w13 + a2 .req w14 + a3 .req w15 + a4 .req w16 + a5 .req w17 + a6 .req w19 + a7 .req w20 + a8 .req w21 + a9 .req w22 + a10 .req w23 + a11 .req w24 + a12 .req w25 + a13 .req w26 + a14 .req w27 + a15 .req w28 + + .align 6 +SYM_FUNC_START(chacha_4block_xor_neon) + frame_push 10 + + // x0: Input state matrix, s + // x1: 4 data blocks output, o + // x2: 4 data blocks input, i + // w3: nrounds + // x4: byte count + + adr_l x10, .Lpermute + and x5, x4, #63 + add x10, x10, x5 + + // + // This function encrypts four consecutive ChaCha blocks by loading + // the state matrix in NEON registers four times. 
The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. For final XORing step we transpose the + // matrix by interleaving 32- and then 64-bit words, which allows us to + // do XOR in NEON registers. + // + // At the same time, a fifth block is encrypted in parallel using + // scalar registers + // + adr_l x9, CTRINC // ... and ROT8 + ld1 {v30.4s-v31.4s}, [x9] + + // x0..15[0-3] = s0..3[0..3] + add x8, x0, #16 + ld4r { v0.4s- v3.4s}, [x0] + ld4r { v4.4s- v7.4s}, [x8], #16 + ld4r { v8.4s-v11.4s}, [x8], #16 + ld4r {v12.4s-v15.4s}, [x8] + + mov a0, v0.s[0] + mov a1, v1.s[0] + mov a2, v2.s[0] + mov a3, v3.s[0] + mov a4, v4.s[0] + mov a5, v5.s[0] + mov a6, v6.s[0] + mov a7, v7.s[0] + mov a8, v8.s[0] + mov a9, v9.s[0] + mov a10, v10.s[0] + mov a11, v11.s[0] + mov a12, v12.s[0] + mov a13, v13.s[0] + mov a14, v14.s[0] + mov a15, v15.s[0] + + // x12 += counter values 1-4 + add v12.4s, v12.4s, v30.4s + +.Ldoubleround4: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + add v0.4s, v0.4s, v4.4s + add a0, a0, a4 + add v1.4s, v1.4s, v5.4s + add a1, a1, a5 + add v2.4s, v2.4s, v6.4s + add a2, a2, a6 + add v3.4s, v3.4s, v7.4s + add a3, a3, a7 + + eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 + eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 + eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 + eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 + + rev32 v12.8h, v12.8h + ror a12, a12, #16 + rev32 v13.8h, v13.8h + ror a13, a13, #16 + rev32 v14.8h, v14.8h + ror a14, a14, #16 + rev32 v15.8h, v15.8h + ror a15, a15, #16 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + add v8.4s, v8.4s, v12.4s + add a8, a8, a12 + add v9.4s, v9.4s, v13.4s + add a9, a9, a13 + add v10.4s, v10.4s, v14.4s + add a10, a10, a14 + add v11.4s, v11.4s, v15.4s + add a11, a11, a15 + + eor v16.16b, v4.16b, v8.16b + eor a4, a4, a8 + eor v17.16b, v5.16b, v9.16b + eor a5, a5, a9 + eor v18.16b, v6.16b, v10.16b + eor a6, a6, a10 + eor v19.16b, v7.16b, v11.16b + eor a7, a7, a11 + + shl v4.4s, v16.4s, #12 + shl v5.4s, v17.4s, #12 + shl v6.4s, v18.4s, #12 + shl v7.4s, v19.4s, #12 + + sri v4.4s, v16.4s, #20 + ror a4, a4, #20 + sri v5.4s, v17.4s, #20 + ror a5, a5, #20 + sri v6.4s, v18.4s, #20 + ror a6, a6, #20 + sri v7.4s, v19.4s, #20 + ror a7, a7, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + add v0.4s, v0.4s, v4.4s + add a0, a0, a4 + add v1.4s, v1.4s, v5.4s + add a1, a1, a5 + add v2.4s, v2.4s, v6.4s + add a2, a2, a6 + add v3.4s, v3.4s, v7.4s + add a3, a3, a7 + + eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 + eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 + eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 + eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 + + tbl v12.16b, {v12.16b}, v31.16b + ror a12, a12, #24 + tbl v13.16b, {v13.16b}, v31.16b + ror a13, a13, #24 + tbl v14.16b, {v14.16b}, v31.16b + ror a14, a14, #24 + tbl v15.16b, {v15.16b}, v31.16b + ror a15, a15, #24 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + add v8.4s, v8.4s, v12.4s + add a8, a8, a12 + add v9.4s, v9.4s, v13.4s + add a9, a9, a13 
+ add v10.4s, v10.4s, v14.4s + add a10, a10, a14 + add v11.4s, v11.4s, v15.4s + add a11, a11, a15 + + eor v16.16b, v4.16b, v8.16b + eor a4, a4, a8 + eor v17.16b, v5.16b, v9.16b + eor a5, a5, a9 + eor v18.16b, v6.16b, v10.16b + eor a6, a6, a10 + eor v19.16b, v7.16b, v11.16b + eor a7, a7, a11 + + shl v4.4s, v16.4s, #7 + shl v5.4s, v17.4s, #7 + shl v6.4s, v18.4s, #7 + shl v7.4s, v19.4s, #7 + + sri v4.4s, v16.4s, #25 + ror a4, a4, #25 + sri v5.4s, v17.4s, #25 + ror a5, a5, #25 + sri v6.4s, v18.4s, #25 + ror a6, a6, #25 + sri v7.4s, v19.4s, #25 + ror a7, a7, #25 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + add v0.4s, v0.4s, v5.4s + add a0, a0, a5 + add v1.4s, v1.4s, v6.4s + add a1, a1, a6 + add v2.4s, v2.4s, v7.4s + add a2, a2, a7 + add v3.4s, v3.4s, v4.4s + add a3, a3, a4 + + eor v15.16b, v15.16b, v0.16b + eor a15, a15, a0 + eor v12.16b, v12.16b, v1.16b + eor a12, a12, a1 + eor v13.16b, v13.16b, v2.16b + eor a13, a13, a2 + eor v14.16b, v14.16b, v3.16b + eor a14, a14, a3 + + rev32 v15.8h, v15.8h + ror a15, a15, #16 + rev32 v12.8h, v12.8h + ror a12, a12, #16 + rev32 v13.8h, v13.8h + ror a13, a13, #16 + rev32 v14.8h, v14.8h + ror a14, a14, #16 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + add v10.4s, v10.4s, v15.4s + add a10, a10, a15 + add v11.4s, v11.4s, v12.4s + add a11, a11, a12 + add v8.4s, v8.4s, v13.4s + add a8, a8, a13 + add v9.4s, v9.4s, v14.4s + add a9, a9, a14 + + eor v16.16b, v5.16b, v10.16b + eor a5, a5, a10 + eor v17.16b, v6.16b, v11.16b + eor a6, a6, a11 + eor v18.16b, v7.16b, v8.16b + eor a7, a7, a8 + eor v19.16b, v4.16b, v9.16b + eor a4, a4, a9 + + shl v5.4s, v16.4s, #12 + shl v6.4s, v17.4s, #12 + shl v7.4s, v18.4s, #12 + shl v4.4s, v19.4s, #12 + + sri v5.4s, v16.4s, #20 + ror a5, a5, #20 + sri v6.4s, v17.4s, #20 + ror a6, a6, #20 + sri v7.4s, v18.4s, #20 + ror a7, a7, #20 + sri v4.4s, v19.4s, #20 + ror a4, a4, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + add v0.4s, v0.4s, v5.4s + add a0, a0, a5 + add v1.4s, v1.4s, v6.4s + add a1, a1, a6 + add v2.4s, v2.4s, v7.4s + add a2, a2, a7 + add v3.4s, v3.4s, v4.4s + add a3, a3, a4 + + eor v15.16b, v15.16b, v0.16b + eor a15, a15, a0 + eor v12.16b, v12.16b, v1.16b + eor a12, a12, a1 + eor v13.16b, v13.16b, v2.16b + eor a13, a13, a2 + eor v14.16b, v14.16b, v3.16b + eor a14, a14, a3 + + tbl v15.16b, {v15.16b}, v31.16b + ror a15, a15, #24 + tbl v12.16b, {v12.16b}, v31.16b + ror a12, a12, #24 + tbl v13.16b, {v13.16b}, v31.16b + ror a13, a13, #24 + tbl v14.16b, {v14.16b}, v31.16b + ror a14, a14, #24 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + add v10.4s, v10.4s, v15.4s + add a10, a10, a15 + add v11.4s, v11.4s, v12.4s + add a11, a11, a12 + add v8.4s, v8.4s, v13.4s + add a8, a8, a13 + add v9.4s, v9.4s, v14.4s + add a9, a9, a14 + + eor v16.16b, v5.16b, v10.16b + eor a5, a5, a10 + eor v17.16b, v6.16b, v11.16b + eor a6, a6, a11 + eor v18.16b, v7.16b, v8.16b + eor a7, a7, a8 + eor v19.16b, v4.16b, v9.16b + eor a4, a4, a9 + + shl v5.4s, v16.4s, #7 + shl v6.4s, v17.4s, #7 + shl v7.4s, v18.4s, #7 + shl v4.4s, v19.4s, #7 + + sri v5.4s, v16.4s, #25 + ror a5, a5, 
#25 + sri v6.4s, v17.4s, #25 + ror a6, a6, #25 + sri v7.4s, v18.4s, #25 + ror a7, a7, #25 + sri v4.4s, v19.4s, #25 + ror a4, a4, #25 + + subs w3, w3, #2 + b.ne .Ldoubleround4 + + ld4r {v16.4s-v19.4s}, [x0], #16 + ld4r {v20.4s-v23.4s}, [x0], #16 + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v30.4s + + // x0[0-3] += s0[0] + // x1[0-3] += s0[1] + // x2[0-3] += s0[2] + // x3[0-3] += s0[3] + add v0.4s, v0.4s, v16.4s + mov w6, v16.s[0] + mov w7, v17.s[0] + add v1.4s, v1.4s, v17.4s + mov w8, v18.s[0] + mov w9, v19.s[0] + add v2.4s, v2.4s, v18.4s + add a0, a0, w6 + add a1, a1, w7 + add v3.4s, v3.4s, v19.4s + add a2, a2, w8 + add a3, a3, w9 +CPU_BE( rev a0, a0 ) +CPU_BE( rev a1, a1 ) +CPU_BE( rev a2, a2 ) +CPU_BE( rev a3, a3 ) + + ld4r {v24.4s-v27.4s}, [x0], #16 + ld4r {v28.4s-v31.4s}, [x0] + + // x4[0-3] += s1[0] + // x5[0-3] += s1[1] + // x6[0-3] += s1[2] + // x7[0-3] += s1[3] + add v4.4s, v4.4s, v20.4s + mov w6, v20.s[0] + mov w7, v21.s[0] + add v5.4s, v5.4s, v21.4s + mov w8, v22.s[0] + mov w9, v23.s[0] + add v6.4s, v6.4s, v22.4s + add a4, a4, w6 + add a5, a5, w7 + add v7.4s, v7.4s, v23.4s + add a6, a6, w8 + add a7, a7, w9 +CPU_BE( rev a4, a4 ) +CPU_BE( rev a5, a5 ) +CPU_BE( rev a6, a6 ) +CPU_BE( rev a7, a7 ) + + // x8[0-3] += s2[0] + // x9[0-3] += s2[1] + // x10[0-3] += s2[2] + // x11[0-3] += s2[3] + add v8.4s, v8.4s, v24.4s + mov w6, v24.s[0] + mov w7, v25.s[0] + add v9.4s, v9.4s, v25.4s + mov w8, v26.s[0] + mov w9, v27.s[0] + add v10.4s, v10.4s, v26.4s + add a8, a8, w6 + add a9, a9, w7 + add v11.4s, v11.4s, v27.4s + add a10, a10, w8 + add a11, a11, w9 +CPU_BE( rev a8, a8 ) +CPU_BE( rev a9, a9 ) +CPU_BE( rev a10, a10 ) +CPU_BE( rev a11, a11 ) + + // x12[0-3] += s3[0] + // x13[0-3] += s3[1] + // x14[0-3] += s3[2] + // x15[0-3] += s3[3] + add v12.4s, v12.4s, v28.4s + mov w6, v28.s[0] + mov w7, v29.s[0] + add v13.4s, v13.4s, v29.4s + mov w8, v30.s[0] + mov w9, v31.s[0] + add v14.4s, v14.4s, v30.4s + add a12, a12, w6 + add a13, a13, w7 + add v15.4s, v15.4s, v31.4s + add a14, a14, w8 + add a15, a15, w9 +CPU_BE( rev a12, a12 ) +CPU_BE( rev a13, a13 ) +CPU_BE( rev a14, a14 ) +CPU_BE( rev a15, a15 ) + + // interleave 32-bit words in state n, n+1 + ldp w6, w7, [x2], #64 + zip1 v16.4s, v0.4s, v1.4s + ldp w8, w9, [x2, #-56] + eor a0, a0, w6 + zip2 v17.4s, v0.4s, v1.4s + eor a1, a1, w7 + zip1 v18.4s, v2.4s, v3.4s + eor a2, a2, w8 + zip2 v19.4s, v2.4s, v3.4s + eor a3, a3, w9 + ldp w6, w7, [x2, #-48] + zip1 v20.4s, v4.4s, v5.4s + ldp w8, w9, [x2, #-40] + eor a4, a4, w6 + zip2 v21.4s, v4.4s, v5.4s + eor a5, a5, w7 + zip1 v22.4s, v6.4s, v7.4s + eor a6, a6, w8 + zip2 v23.4s, v6.4s, v7.4s + eor a7, a7, w9 + ldp w6, w7, [x2, #-32] + zip1 v24.4s, v8.4s, v9.4s + ldp w8, w9, [x2, #-24] + eor a8, a8, w6 + zip2 v25.4s, v8.4s, v9.4s + eor a9, a9, w7 + zip1 v26.4s, v10.4s, v11.4s + eor a10, a10, w8 + zip2 v27.4s, v10.4s, v11.4s + eor a11, a11, w9 + ldp w6, w7, [x2, #-16] + zip1 v28.4s, v12.4s, v13.4s + ldp w8, w9, [x2, #-8] + eor a12, a12, w6 + zip2 v29.4s, v12.4s, v13.4s + eor a13, a13, w7 + zip1 v30.4s, v14.4s, v15.4s + eor a14, a14, w8 + zip2 v31.4s, v14.4s, v15.4s + eor a15, a15, w9 + + add x3, x2, x4 + sub x3, x3, #128 // start of last block + + subs x5, x4, #128 + csel x2, x2, x3, ge + + // interleave 64-bit words in state n, n+2 + zip1 v0.2d, v16.2d, v18.2d + zip2 v4.2d, v16.2d, v18.2d + stp a0, a1, [x1], #64 + zip1 v8.2d, v17.2d, v19.2d + zip2 v12.2d, v17.2d, v19.2d + stp a2, a3, [x1, #-56] + + subs x6, x4, #192 + ld1 {v16.16b-v19.16b}, [x2], #64 + csel x2, x2, x3, ge + + zip1 v1.2d, v20.2d, v22.2d + 
zip2 v5.2d, v20.2d, v22.2d + stp a4, a5, [x1, #-48] + zip1 v9.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + stp a6, a7, [x1, #-40] + + subs x7, x4, #256 + ld1 {v20.16b-v23.16b}, [x2], #64 + csel x2, x2, x3, ge + + zip1 v2.2d, v24.2d, v26.2d + zip2 v6.2d, v24.2d, v26.2d + stp a8, a9, [x1, #-32] + zip1 v10.2d, v25.2d, v27.2d + zip2 v14.2d, v25.2d, v27.2d + stp a10, a11, [x1, #-24] + + subs x8, x4, #320 + ld1 {v24.16b-v27.16b}, [x2], #64 + csel x2, x2, x3, ge + + zip1 v3.2d, v28.2d, v30.2d + zip2 v7.2d, v28.2d, v30.2d + stp a12, a13, [x1, #-16] + zip1 v11.2d, v29.2d, v31.2d + zip2 v15.2d, v29.2d, v31.2d + stp a14, a15, [x1, #-8] + + tbnz x5, #63, .Lt128 + ld1 {v28.16b-v31.16b}, [x2] + + // xor with corresponding input, write to output + eor v16.16b, v16.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + eor v18.16b, v18.16b, v2.16b + eor v19.16b, v19.16b, v3.16b + + tbnz x6, #63, .Lt192 + + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v6.16b + eor v23.16b, v23.16b, v7.16b + + st1 {v16.16b-v19.16b}, [x1], #64 + tbnz x7, #63, .Lt256 + + eor v24.16b, v24.16b, v8.16b + eor v25.16b, v25.16b, v9.16b + eor v26.16b, v26.16b, v10.16b + eor v27.16b, v27.16b, v11.16b + + st1 {v20.16b-v23.16b}, [x1], #64 + tbnz x8, #63, .Lt320 + + eor v28.16b, v28.16b, v12.16b + eor v29.16b, v29.16b, v13.16b + eor v30.16b, v30.16b, v14.16b + eor v31.16b, v31.16b, v15.16b + + st1 {v24.16b-v27.16b}, [x1], #64 + st1 {v28.16b-v31.16b}, [x1] + +.Lout: frame_pop + ret + + // fewer than 192 bytes of in/output +.Lt192: cbz x5, 1f // exactly 128 bytes? + ld1 {v28.16b-v31.16b}, [x10] + add x5, x5, x1 + tbl v28.16b, {v4.16b-v7.16b}, v28.16b + tbl v29.16b, {v4.16b-v7.16b}, v29.16b + tbl v30.16b, {v4.16b-v7.16b}, v30.16b + tbl v31.16b, {v4.16b-v7.16b}, v31.16b + +0: eor v20.16b, v20.16b, v28.16b + eor v21.16b, v21.16b, v29.16b + eor v22.16b, v22.16b, v30.16b + eor v23.16b, v23.16b, v31.16b + st1 {v20.16b-v23.16b}, [x5] // overlapping stores +1: st1 {v16.16b-v19.16b}, [x1] + b .Lout + + // fewer than 128 bytes of in/output +.Lt128: ld1 {v28.16b-v31.16b}, [x10] + add x5, x5, x1 + sub x1, x1, #64 + tbl v28.16b, {v0.16b-v3.16b}, v28.16b + tbl v29.16b, {v0.16b-v3.16b}, v29.16b + tbl v30.16b, {v0.16b-v3.16b}, v30.16b + tbl v31.16b, {v0.16b-v3.16b}, v31.16b + ld1 {v16.16b-v19.16b}, [x1] // reload first output block + b 0b + + // fewer than 256 bytes of in/output +.Lt256: cbz x6, 2f // exactly 192 bytes? + ld1 {v4.16b-v7.16b}, [x10] + add x6, x6, x1 + tbl v0.16b, {v8.16b-v11.16b}, v4.16b + tbl v1.16b, {v8.16b-v11.16b}, v5.16b + tbl v2.16b, {v8.16b-v11.16b}, v6.16b + tbl v3.16b, {v8.16b-v11.16b}, v7.16b + + eor v28.16b, v28.16b, v0.16b + eor v29.16b, v29.16b, v1.16b + eor v30.16b, v30.16b, v2.16b + eor v31.16b, v31.16b, v3.16b + st1 {v28.16b-v31.16b}, [x6] // overlapping stores +2: st1 {v20.16b-v23.16b}, [x1] + b .Lout + + // fewer than 320 bytes of in/output +.Lt320: cbz x7, 3f // exactly 256 bytes? 
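As an illustrative aside, not part of the patch: the "x0 += x5, x15 = rotl32(x15 ^ x0, 16)" comments above annotate standard ChaCha quarter-rounds. Each vN register carries one state word for four blocks at once, while the a0-a15 scalar registers carry a fifth block interleaved with the NEON code, which is why the glue below feeds this routine up to CHACHA_BLOCK_SIZE * 5 bytes per call and why the tail labels run up to .Lt320. A plain C quarter-round for reference; rotl32_ is a hypothetical helper, not a kernel API:

#include <stdint.h>

static inline uint32_t rotl32_(uint32_t v, int n)
{
        return (v << n) | (v >> (32 - n));
}

/* One ChaCha quarter-round on state words a, b, c, d (RFC 8439). */
static void chacha_quarterround(uint32_t x[16], int a, int b, int c, int d)
{
        x[a] += x[b]; x[d] = rotl32_(x[d] ^ x[a], 16);
        x[c] += x[d]; x[b] = rotl32_(x[b] ^ x[c], 12);
        x[a] += x[b]; x[d] = rotl32_(x[d] ^ x[a], 8);
        x[c] += x[d]; x[b] = rotl32_(x[b] ^ x[c], 7);
}

Interleaving the scalar block with the vector blocks lets the integer and SIMD pipelines hide each other's latencies, which is presumably why the fifth block is done in general-purpose registers rather than as an extra NEON pass.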
+ ld1 {v4.16b-v7.16b}, [x10] + add x7, x7, x1 + tbl v0.16b, {v12.16b-v15.16b}, v4.16b + tbl v1.16b, {v12.16b-v15.16b}, v5.16b + tbl v2.16b, {v12.16b-v15.16b}, v6.16b + tbl v3.16b, {v12.16b-v15.16b}, v7.16b + + eor v28.16b, v28.16b, v0.16b + eor v29.16b, v29.16b, v1.16b + eor v30.16b, v30.16b, v2.16b + eor v31.16b, v31.16b, v3.16b + st1 {v28.16b-v31.16b}, [x7] // overlapping stores +3: st1 {v24.16b-v27.16b}, [x1] + b .Lout +SYM_FUNC_END(chacha_4block_xor_neon) + + .section ".rodata", "a", %progbits + .align L1_CACHE_SHIFT +.Lpermute: + .set .Li, 0 + .rept 128 + .byte (.Li - 64) + .set .Li, .Li + 1 + .endr + +CTRINC: .word 1, 2, 3, 4 +ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/lib/crypto/arm64/chacha-neon-glue.c b/lib/crypto/arm64/chacha-neon-glue.c new file mode 100644 index 000000000000..d0188f974ca5 --- /dev/null +++ b/lib/crypto/arm64/chacha-neon-glue.c @@ -0,0 +1,119 @@ +/* + * ChaCha and HChaCha functions (ARM64 optimized) + * + * Copyright (C) 2016 - 2017 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, int nrounds); +asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, + int nrounds, int bytes); +asmlinkage void hchacha_block_neon(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, + int bytes, int nrounds) +{ + while (bytes > 0) { + int l = min(bytes, CHACHA_BLOCK_SIZE * 5); + + if (l <= CHACHA_BLOCK_SIZE) { + u8 buf[CHACHA_BLOCK_SIZE]; + + memcpy(buf, src, l); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, l); + state->x[12] += 1; + break; + } + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= l; + src += l; + dst += l; + state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); + } +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { + hchacha_block_generic(state, out, nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, out, nrounds); + kernel_neon_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool 
chacha_is_arch_optimized(void) +{ + return static_key_enabled(&have_neon); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_simd_mod_init(void) +{ + if (cpu_have_named_feature(ASIMD)) + static_branch_enable(&have_neon); + return 0; +} +subsys_initcall(chacha_simd_mod_init); + +static void __exit chacha_simd_mod_exit(void) +{ +} +module_exit(chacha_simd_mod_exit); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl new file mode 100644 index 000000000000..22c9069c0650 --- /dev/null +++ b/lib/crypto/arm64/poly1305-armv8.pl @@ -0,0 +1,917 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# This module implements Poly1305 hash for ARMv8. +# +# June 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc-4.9 NEON +# +# Apple A7 1.86/+5% 0.72 +# Cortex-A53 2.69/+58% 1.47 +# Cortex-A57 2.70/+7% 1.14 +# Denver 1.64/+50% 1.18(*) +# X-Gene 2.13/+68% 2.27 +# Mongoose 1.77/+75% 1.12 +# Kryo 2.70/+55% 1.13 +# ThunderX2 1.17/+95% 1.36 +# +# (*) estimate based on resources availability is less than 1.0, +# i.e. measured result is worse than expected, presumably binary +# translator is not almighty; + +$flavour=shift; +$output=shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); +my ($mac,$nonce)=($inp,$len); + +my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +.extern OPENSSL_armcap_P +#endif + +.text + +// forward "declarations" are required for Apple +.globl poly1305_blocks +.globl poly1305_emit + +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: + cmp $inp,xzr + stp xzr,xzr,[$ctx] // zero hash value + stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] + + csel x0,xzr,x0,eq + b.eq .Lno_key + +#ifndef __KERNEL__ + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] +#endif + + ldp $r0,$r1,[$inp] // load key + mov $s1,#0xfffffffc0fffffff + movk $s1,#0x0fff,lsl#48 +#ifdef __AARCH64EB__ + rev $r0,$r0 // flip bytes + rev $r1,$r1 +#endif + and $r0,$r0,$s1 // &=0ffffffc0fffffff + and $s1,$s1,#-4 + and $r1,$r1,$s1 // &=0ffffffc0ffffffc + mov w#$s1,#-1 + stp $r0,$r1,[$ctx,#32] // save key value + str w#$s1,[$ctx,#48] // impossible key power value + +#ifndef __KERNEL__ + tst w17,#ARMV7_NEON + + adr $d0,.Lpoly1305_blocks + adr $r0,.Lpoly1305_blocks_neon + adr $d1,.Lpoly1305_emit + + csel $d0,$d0,$r0,eq + +# ifdef __ILP32__ + stp w#$d0,w#$d1,[$len] +# else + stp $d0,$d1,[$len] +# endif +#endif + mov x0,#1 +.Lno_key: + ret +.size poly1305_init,.-poly1305_init + +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + ands $len,$len,#-16 + b.eq .Lno_data + + ldp $h0,$h1,[$ctx] // load hash value + ldp $h2,x17,[$ctx,#16] // [along with is_base2_26] + ldp 
$r0,$r1,[$ctx,#32] // load key value + +#ifdef __AARCH64EB__ + lsr $d0,$h0,#32 + mov w#$d1,w#$h0 + lsr $d2,$h1,#32 + mov w15,w#$h1 + lsr x16,$h2,#32 +#else + mov w#$d0,w#$h0 + lsr $d1,$h0,#32 + mov w#$d2,w#$h1 + lsr x15,$h1,#32 + mov w16,w#$h2 +#endif + + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 + lsr $d1,$d2,#12 + adds $d0,$d0,$d2,lsl#52 + add $d1,$d1,x15,lsl#14 + adc $d1,$d1,xzr + lsr $d2,x16,#24 + adds $d1,$d1,x16,lsl#40 + adc $d2,$d2,xzr + + cmp x17,#0 // is_base2_26? + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + csel $h0,$h0,$d0,eq // choose between radixes + csel $h1,$h1,$d1,eq + csel $h2,$h2,$d2,eq + +.Loop: + ldp $t0,$t1,[$inp],#16 // load input + sub $len,$len,#16 +#ifdef __AARCH64EB__ + rev $t0,$t0 + rev $t1,$t1 +#endif + adds $h0,$h0,$t0 // accumulate input + adcs $h1,$h1,$t1 + + mul $d0,$h0,$r0 // h0*r0 + adc $h2,$h2,$padbit + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + cbnz $len,.Loop + + stp $h0,$h1,[$ctx] // store hash value + stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26] + +.Lno_data: + ret +.size poly1305_blocks,.-poly1305_blocks + +.type poly1305_emit,%function +.align 5 +poly1305_emit: +.Lpoly1305_emit: + ldp $h0,$h1,[$ctx] // load hash base 2^64 + ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26] + ldp $t0,$t1,[$nonce] // load nonce + +#ifdef __AARCH64EB__ + lsr $d0,$h0,#32 + mov w#$d1,w#$h0 + lsr $d2,$h1,#32 + mov w15,w#$h1 + lsr x16,$h2,#32 +#else + mov w#$d0,w#$h0 + lsr $d1,$h0,#32 + mov w#$d2,w#$h1 + lsr x15,$h1,#32 + mov w16,w#$h2 +#endif + + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 + lsr $d1,$d2,#12 + adds $d0,$d0,$d2,lsl#52 + add $d1,$d1,x15,lsl#14 + adc $d1,$d1,xzr + lsr $d2,x16,#24 + adds $d1,$d1,x16,lsl#40 + adc $d2,$d2,xzr + + cmp $r0,#0 // is_base2_26? 
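As an illustrative aside, not part of the patch: the "base 2^26 -> base 2^64" sequences above (the add ...,lsl#26 / adds ...,lsl#52 pieces together with the lsr#12, lsl#14, lsl#40 and lsr#24 terms) repack the accumulator H = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104 from the five ~26-bit limbs kept by the NEON code into the three 64-bit words the scalar code multiplies with. A C sketch of the same repacking; the function name and the use of unsigned __int128 are just for the example:

#include <stdint.h>

static void poly1305_26_to_64(const uint32_t h[5], uint64_t d[3])
{
        unsigned __int128 acc;

        acc  = (unsigned __int128)h[0];
        acc += (unsigned __int128)h[1] << 26;
        acc += (unsigned __int128)h[2] << 52;
        d[0] = (uint64_t)acc;                   /* bits   0..63  */

        acc >>= 64;
        acc += (unsigned __int128)h[3] << (78 - 64);
        acc += (unsigned __int128)h[4] << (104 - 64);
        d[1] = (uint64_t)acc;                   /* bits  64..127 */
        d[2] = (uint64_t)(acc >> 64);           /* bits 128..129 */
}

The s1 = r1 + (r1 >> 2) value used by the block loop is (5*r1)/4, exact because key clamping (the &= 0ffffffc0ffffffc above) makes r1 a multiple of 4; it is what lets the 2^128-weight partial products be folded straight back modulo 2^130 - 5.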
+ csel $h0,$h0,$d0,eq // choose between radixes + csel $h1,$h1,$d1,eq + csel $h2,$h2,$d2,eq + + adds $d0,$h0,#5 // compare to modulus + adcs $d1,$h1,xzr + adc $d2,$h2,xzr + + tst $d2,#-4 // see if it's carried/borrowed + + csel $h0,$h0,$d0,eq + csel $h1,$h1,$d1,eq + +#ifdef __AARCH64EB__ + ror $t0,$t0,#32 // flip nonce words + ror $t1,$t1,#32 +#endif + adds $h0,$h0,$t0 // accumulate nonce + adc $h1,$h1,$t1 +#ifdef __AARCH64EB__ + rev $h0,$h0 // flip output bytes + rev $h1,$h1 +#endif + stp $h0,$h1,[$mac] // write result + + ret +.size poly1305_emit,.-poly1305_emit +___ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); +my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); +my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); +my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); +my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); +my ($T0,$T1,$MASK) = map("v$_",(29..31)); + +my ($in2,$zeros)=("x16","x17"); +my $is_base2_26 = $zeros; # borrow + +$code.=<<___; +.type poly1305_mult,%function +.align 5 +poly1305_mult: + mul $d0,$h0,$r0 // h0*r0 + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + ret +.size poly1305_mult,.-poly1305_mult + +.type poly1305_splat,%function +.align 4 +poly1305_splat: + and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x13,$h0,#26,#26 + extr x14,$h1,$h0,#52 + and x14,x14,#0x03ffffff + ubfx x15,$h1,#14,#26 + extr x16,$h2,$h1,#40 + + str w12,[$ctx,#16*0] // r0 + add w12,w13,w13,lsl#2 // r1*5 + str w13,[$ctx,#16*1] // r1 + add w13,w14,w14,lsl#2 // r2*5 + str w12,[$ctx,#16*2] // s1 + str w14,[$ctx,#16*3] // r2 + add w14,w15,w15,lsl#2 // r3*5 + str w13,[$ctx,#16*4] // s2 + str w15,[$ctx,#16*5] // r3 + add w15,w16,w16,lsl#2 // r4*5 + str w14,[$ctx,#16*6] // s3 + str w16,[$ctx,#16*7] // r4 + str w15,[$ctx,#16*8] // s4 + + ret +.size poly1305_splat,.-poly1305_splat + +#ifdef __KERNEL__ +.globl poly1305_blocks_neon +#endif +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: + ldr $is_base2_26,[$ctx,#24] + cmp $len,#128 + b.lo .Lpoly1305_blocks + + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + cbz $is_base2_26,.Lbase2_64_neon + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + tst $len,#31 + b.eq .Leven_neon + + ldp $r0,$r1,[$ctx,#32] // load key value + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $d2,$h2,xzr // can be partially reduced... 
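As an illustrative aside, not part of the patch: poly1305_splat above goes the other way, splitting a base 2^64 value into five 26-bit limbs and storing each limb r[i] together with 5*r[i] (the s1..s4 slots), so the NEON loop can reduce 2^130-weight terms by multiplying by 5 instead of carrying across limbs. A C sketch of the split; the name is made up for the example:

#include <stdint.h>

static void poly1305_64_to_26(const uint64_t h[3], uint32_t r[5])
{
        r[0] = (uint32_t)(h[0] & 0x03ffffff);                         /* bits   0..25  */
        r[1] = (uint32_t)((h[0] >> 26) & 0x03ffffff);                 /* bits  26..51  */
        r[2] = (uint32_t)(((h[0] >> 52) | (h[1] << 12)) & 0x03ffffff);/* bits  52..77  */
        r[3] = (uint32_t)((h[1] >> 14) & 0x03ffffff);                 /* bits  78..103 */
        r[4] = (uint32_t)((h[1] >> 40) | (h[2] << 24));               /* bits 104..129 */
}

This mirrors the and/ubfx/extr sequence almost instruction for instruction: extr xN,hi,lo,#k computes exactly (lo >> k) | (hi << (64 - k)).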
+ + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + +#ifdef __AARCH64EB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + b .Leven_neon + +.align 4 +.Lbase2_64_neon: + ldp $r0,$r1,[$ctx,#32] // load key value + + ldp $h0,$h1,[$ctx] // load hash value base 2^64 + ldr $h2,[$ctx,#16] + + tst $len,#31 + b.eq .Linit_neon + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) +#ifdef __AARCH64EB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + +.Linit_neon: + ldr w17,[$ctx,#48] // first table element + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + cmp w17,#-1 // is value impossible? + b.ne .Leven_neon + + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + + ////////////////////////////////// initialize r^n table + mov $h0,$r0 // r^1 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + mov $h1,$r1 + mov $h2,xzr + add $ctx,$ctx,#48+12 + bl poly1305_splat + + bl poly1305_mult // r^2 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^3 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^4 + sub $ctx,$ctx,#4 + bl poly1305_splat + sub $ctx,$ctx,#48 // restore original $ctx + b .Ldo_neon + +.align 4 +.Leven_neon: + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + +.Ldo_neon: + ldp x8,x12,[$inp,#32] // inp[2:3] + subs $len,$len,#64 + ldp x9,x13,[$inp,#48] + add $in2,$inp,#96 + adrp $zeros,.Lzeros + add $zeros,$zeros,#:lo12:.Lzeros + + lsl $padbit,$padbit,#24 + add x15,$ctx,#48 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN23_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN23_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + fmov $IN23_2,x8 + fmov $IN23_3,x10 + fmov $IN23_4,x12 + + ldp x8,x12,[$inp],#16 // inp[0:1] + ldp x9,x13,[$inp],#48 + + ld1 {$R0,$R1,$S1,$R2},[x15],#64 + ld1 {$S2,$R3,$S3,$R4},[x15],#64 + ld1 {$S4},[x15] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN01_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN01_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add 
x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + movi $MASK.2d,#-1 + fmov $IN01_2,x8 + fmov $IN01_3,x10 + fmov $IN01_4,x12 + ushr $MASK.2d,$MASK.2d,#38 + + b.ls .Lskip_loop + +.align 4 +.Loop_neon: + //////////////////////////////////////////////////////////////// + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + // \___________________/ + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + // \___________________/ \____________________/ + // + // Note that we start with inp[2:3]*r^2. This is because it + // doesn't depend on reduction in previous iteration. + //////////////////////////////////////////////////////////////// + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + subs $len,$len,#64 + umull $ACC4,$IN23_0,${R4}[2] + csel $in2,$zeros,$in2,lo + umull $ACC3,$IN23_0,${R3}[2] + umull $ACC2,$IN23_0,${R2}[2] + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) + umull $ACC1,$IN23_0,${R1}[2] + ldp x9,x13,[$in2],#48 + umull $ACC0,$IN23_0,${R0}[2] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + umlal $ACC4,$IN23_1,${R3}[2] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC3,$IN23_1,${R2}[2] + and x5,x9,#0x03ffffff + umlal $ACC2,$IN23_1,${R1}[2] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN23_1,${R0}[2] + ubfx x7,x9,#26,#26 + umlal $ACC0,$IN23_1,${S4}[2] + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + + umlal $ACC4,$IN23_2,${R2}[2] + extr x8,x12,x8,#52 + umlal $ACC3,$IN23_2,${R1}[2] + extr x9,x13,x9,#52 + umlal $ACC2,$IN23_2,${R0}[2] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC1,$IN23_2,${S4}[2] + fmov $IN23_0,x4 + umlal $ACC0,$IN23_2,${S3}[2] + and x8,x8,#0x03ffffff + + umlal $ACC4,$IN23_3,${R1}[2] + and x9,x9,#0x03ffffff + umlal $ACC3,$IN23_3,${R0}[2] + ubfx x10,x12,#14,#26 + umlal $ACC2,$IN23_3,${S4}[2] + ubfx x11,x13,#14,#26 + umlal $ACC1,$IN23_3,${S3}[2] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC0,$IN23_3,${S2}[2] + fmov $IN23_1,x6 + + add $IN01_2,$IN01_2,$H2 + add x12,$padbit,x12,lsr#40 + umlal $ACC4,$IN23_4,${R0}[2] + add x13,$padbit,x13,lsr#40 + umlal $ACC3,$IN23_4,${S4}[2] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC2,$IN23_4,${S3}[2] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN23_4,${S2}[2] + fmov $IN23_2,x8 + umlal $ACC0,$IN23_4,${S1}[2] + fmov $IN23_3,x10 + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4 and accumulate + + add $IN01_0,$IN01_0,$H0 + fmov $IN23_4,x12 + umlal $ACC3,$IN01_2,${R1}[0] + ldp x8,x12,[$inp],#16 // inp[0:1] + umlal $ACC0,$IN01_2,${S3}[0] + ldp x9,x13,[$inp],#48 + umlal $ACC4,$IN01_2,${R2}[0] + umlal $ACC1,$IN01_2,${S4}[0] + umlal $ACC2,$IN01_2,${R0}[0] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3}[0] + umlal $ACC4,$IN01_0,${R4}[0] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC2,$IN01_0,${R2}[0] + and x5,x9,#0x03ffffff + umlal $ACC0,$IN01_0,${R0}[0] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN01_0,${R1}[0] + ubfx x7,x9,#26,#26 + + add $IN01_3,$IN01_3,$H3 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + umlal $ACC3,$IN01_1,${R2}[0] + extr x8,x12,x8,#52 + umlal $ACC4,$IN01_1,${R3}[0] + extr 
x9,x13,x9,#52 + umlal $ACC0,$IN01_1,${S4}[0] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC2,$IN01_1,${R1}[0] + fmov $IN01_0,x4 + umlal $ACC1,$IN01_1,${R0}[0] + and x8,x8,#0x03ffffff + + add $IN01_4,$IN01_4,$H4 + and x9,x9,#0x03ffffff + umlal $ACC3,$IN01_3,${R0}[0] + ubfx x10,x12,#14,#26 + umlal $ACC0,$IN01_3,${S2}[0] + ubfx x11,x13,#14,#26 + umlal $ACC4,$IN01_3,${R1}[0] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC1,$IN01_3,${S3}[0] + fmov $IN01_1,x6 + umlal $ACC2,$IN01_3,${S4}[0] + add x12,$padbit,x12,lsr#40 + + umlal $ACC3,$IN01_4,${S4}[0] + add x13,$padbit,x13,lsr#40 + umlal $ACC0,$IN01_4,${S1}[0] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC4,$IN01_4,${R0}[0] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN01_4,${S2}[0] + fmov $IN01_2,x8 + umlal $ACC2,$IN01_4,${S3}[0] + fmov $IN01_3,x10 + fmov $IN01_4,x12 + + ///////////////////////////////////////////////////////////////// + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + // and P. Schwabe + // + // [see discussion in poly1305-armv4 module] + + ushr $T0.2d,$ACC3,#26 + xtn $H3,$ACC3 + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + bic $H3,#0xfc,lsl#24 // &=0x03ffffff + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + xtn $H4,$ACC4 + ushr $T1.2d,$ACC1,#26 + xtn $H1,$ACC1 + bic $H4,#0xfc,lsl#24 + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + shrn $T1.2s,$ACC2,#26 + xtn $H2,$ACC2 + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + bic $H1,#0xfc,lsl#24 + add $H3,$H3,$T1.2s // h2 -> h3 + bic $H2,#0xfc,lsl#24 + + shrn $T0.2s,$ACC0,#26 + xtn $H0,$ACC0 + ushr $T1.2s,$H3,#26 + bic $H3,#0xfc,lsl#24 + bic $H0,#0xfc,lsl#24 + add $H1,$H1,$T0.2s // h0 -> h1 + add $H4,$H4,$T1.2s // h3 -> h4 + + b.hi .Loop_neon + +.Lskip_loop: + dup $IN23_2,${IN23_2}[0] + add $IN01_2,$IN01_2,$H2 + + //////////////////////////////////////////////////////////////// + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + adds $len,$len,#32 + b.ne .Long_tail + + dup $IN23_2,${IN01_2}[0] + add $IN23_0,$IN01_0,$H0 + add $IN23_3,$IN01_3,$H3 + add $IN23_1,$IN01_1,$H1 + add $IN23_4,$IN01_4,$H4 + +.Long_tail: + dup $IN23_0,${IN23_0}[0] + umull2 $ACC0,$IN23_2,${S3} + umull2 $ACC3,$IN23_2,${R1} + umull2 $ACC4,$IN23_2,${R2} + umull2 $ACC2,$IN23_2,${R0} + umull2 $ACC1,$IN23_2,${S4} + + dup $IN23_1,${IN23_1}[0] + umlal2 $ACC0,$IN23_0,${R0} + umlal2 $ACC2,$IN23_0,${R2} + umlal2 $ACC3,$IN23_0,${R3} + umlal2 $ACC4,$IN23_0,${R4} + umlal2 $ACC1,$IN23_0,${R1} + + dup $IN23_3,${IN23_3}[0] + umlal2 $ACC0,$IN23_1,${S4} + umlal2 $ACC3,$IN23_1,${R2} + umlal2 $ACC2,$IN23_1,${R1} + umlal2 $ACC4,$IN23_1,${R3} + umlal2 $ACC1,$IN23_1,${R0} + + dup $IN23_4,${IN23_4}[0] + umlal2 $ACC3,$IN23_3,${R0} + umlal2 $ACC4,$IN23_3,${R1} + umlal2 $ACC0,$IN23_3,${S2} + umlal2 $ACC1,$IN23_3,${S3} + umlal2 $ACC2,$IN23_3,${S4} + + umlal2 $ACC3,$IN23_4,${S4} + umlal2 $ACC0,$IN23_4,${S1} + umlal2 $ACC4,$IN23_4,${R0} + umlal2 $ACC1,$IN23_4,${S2} + umlal2 $ACC2,$IN23_4,${S3} + + b.eq .Lshort_tail + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4:r^3 and accumulate + + add $IN01_0,$IN01_0,$H0 + umlal $ACC3,$IN01_2,${R1} + umlal $ACC0,$IN01_2,${S3} + umlal $ACC4,$IN01_2,${R2} + umlal $ACC1,$IN01_2,${S4} + umlal $ACC2,$IN01_2,${R0} + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3} + umlal $ACC0,$IN01_0,${R0} + umlal $ACC4,$IN01_0,${R4} + umlal $ACC1,$IN01_0,${R1} + umlal $ACC2,$IN01_0,${R2} + + add $IN01_3,$IN01_3,$H3 + 
umlal $ACC3,$IN01_1,${R2} + umlal $ACC0,$IN01_1,${S4} + umlal $ACC4,$IN01_1,${R3} + umlal $ACC1,$IN01_1,${R0} + umlal $ACC2,$IN01_1,${R1} + + add $IN01_4,$IN01_4,$H4 + umlal $ACC3,$IN01_3,${R0} + umlal $ACC0,$IN01_3,${S2} + umlal $ACC4,$IN01_3,${R1} + umlal $ACC1,$IN01_3,${S3} + umlal $ACC2,$IN01_3,${S4} + + umlal $ACC3,$IN01_4,${S4} + umlal $ACC0,$IN01_4,${S1} + umlal $ACC4,$IN01_4,${R0} + umlal $ACC1,$IN01_4,${S2} + umlal $ACC2,$IN01_4,${S3} + +.Lshort_tail: + //////////////////////////////////////////////////////////////// + // horizontal add + + addp $ACC3,$ACC3,$ACC3 + ldp d8,d9,[sp,#16] // meet ABI requirements + addp $ACC0,$ACC0,$ACC0 + ldp d10,d11,[sp,#32] + addp $ACC4,$ACC4,$ACC4 + ldp d12,d13,[sp,#48] + addp $ACC1,$ACC1,$ACC1 + ldp d14,d15,[sp,#64] + addp $ACC2,$ACC2,$ACC2 + ldr x30,[sp,#8] + + //////////////////////////////////////////////////////////////// + // lazy reduction, but without narrowing + + ushr $T0.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + and $ACC4,$ACC4,$MASK.2d + ushr $T1.2d,$ACC1,#26 + and $ACC1,$ACC1,$MASK.2d + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + ushr $T1.2d,$ACC2,#26 + and $ACC2,$ACC2,$MASK.2d + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + add $ACC3,$ACC3,$T1.2d // h2 -> h3 + + ushr $T0.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + ushr $T1.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + add $ACC1,$ACC1,$T0.2d // h0 -> h1 + add $ACC4,$ACC4,$T1.2d // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 + mov x4,#1 + st1 {$ACC4}[0],[$ctx] + str x4,[$ctx,#8] // set is_base2_26 + + ldr x29,[sp],#80 + .inst 0xd50323bf // autiasp + ret +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.pushsection .rodata +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0 +.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" +.popsection + +.align 2 +#if !defined(__KERNEL__) && !defined(_WIN64) +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +___ + +foreach (split("\n",$code)) { + s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or + s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or + (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or + (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or + (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or + (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or + (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); + + s/\.[124]([sd])\[/.$1\[/; + s/w#x([0-9]+)/w$1/g; + + print $_,"\n"; +} +close STDOUT; diff --git a/lib/crypto/arm64/poly1305-glue.c b/lib/crypto/arm64/poly1305-glue.c new file mode 100644 index 000000000000..c9a74766785b --- /dev/null +++ b/lib/crypto/arm64/poly1305-glue.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 + * + * Copyright (C) 2019 Linaro Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + len = round_down(len, POLY1305_BLOCK_SIZE); + if (static_branch_likely(&have_neon)) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks(state, src, len, padbit); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +bool poly1305_is_arch_optimized(void) +{ + /* We always can use at least the ARM64 scalar implementation. */ + return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init neon_poly1305_mod_init(void) +{ + if (cpu_have_named_feature(ASIMD)) + static_branch_enable(&have_neon); + return 0; +} +subsys_initcall(neon_poly1305_mod_init); + +static void __exit neon_poly1305_mod_exit(void) +{ +} +module_exit(neon_poly1305_mod_exit); + +MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/sha2-armv8.pl b/lib/crypto/arm64/sha2-armv8.pl new file mode 100644 index 000000000000..4aebd20c498b --- /dev/null +++ b/lib/crypto/arm64/sha2-armv8.pl @@ -0,0 +1,786 @@ +#! /usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256/512 for ARMv8. 
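As an illustrative aside, not part of the patch: poly1305_blocks_arch() above caps each kernel_neon_begin()/kernel_neon_end() section at SZ_4K bytes of input. Holding the FP/SIMD unit keeps the task from being preempted on most configurations, so bounding the per-section work bounds scheduling latency; the arm64 ChaCha glue earlier in this series uses the same pattern. The shape of it, with a made-up callback name and assuming the usual kernel headers:

static void simd_in_bounded_chunks(const u8 *src, unsigned int len,
                                   void (*neon_step)(const u8 *src,
                                                     unsigned int len))
{
        do {
                unsigned int todo = min_t(unsigned int, len, SZ_4K);

                kernel_neon_begin();
                neon_step(src, todo);
                kernel_neon_end();

                len -= todo;
                src += todo;
        } while (len);
}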
+# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# SHA256-hw SHA256(*) SHA512 +# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +# Denver 2.01 10.5 (+26%) 6.70 (+8%) +# X-Gene 20.0 (+100%) 12.8 (+300%(***)) +# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +# +# (*) Software SHA256 results are of lesser relevance, presented +# mostly for informational purposes. +# (**) The result is a trade-off: it's possible to improve it by +# 10% (or by 1 cycle per round), but at the cost of 20% loss +# on Cortex-A53 (or by 4 cycles per round). +# (***) Super-impressive coefficients over gcc-generated code are +# indication of some compiler "pathology", most notably code +# generated with -mgeneral-regs-only is significantly faster +# and the gap is only 40-90%. +# +# October 2016. +# +# Originally it was reckoned that it makes no sense to implement NEON +# version of SHA256 for 64-bit processors. This is because performance +# improvement on most wide-spread Cortex-A5x processors was observed +# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +# observed that 32-bit NEON SHA256 performs significantly better than +# 64-bit scalar version on *some* of the more recent processors. As +# result 64-bit NEON version of SHA256 was added to provide best +# all-round performance. For example it executes ~30% faster on X-Gene +# and Mongoose. [For reference, NEON version of SHA512 is bound to +# deliver much less improvement, likely *negative* on Cortex-A5x. +# Which is why NEON support is limited to SHA256.] + +$output=pop; +$flavour=pop; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" $xlate $flavour $output"; + *STDOUT=*OUT; +} else { + open STDOUT,">$output"; +} + +if ($output =~ /512/) { + $BITS=512; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $reg_t="x"; +} else { + $BITS=256; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $reg_t="w"; +} + +$func="sha${BITS}_blocks_arch"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +@X=map("$reg_t$_",(3..15,0..2)); +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); + +sub BODY_00_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)&15; +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); + $T0=@X[$i+3] if ($i<11); + +$code.=<<___ if ($i<16); +#ifndef __AARCH64EB__ + rev @X[$i],@X[$i] // $i +#endif +___ +$code.=<<___ if ($i<13 && ($i&1)); + ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ +___ +$code.=<<___ if ($i==13); + ldp @X[14],@X[15],[$inp] +___ +$code.=<<___ if ($i>=14); + ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] +___ +$code.=<<___ if ($i>0 && $i<16); + add $a,$a,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=11); + str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] +___ +# While ARMv8 specifies merged rotate-n-logical operation such as +# 'eor x,y,z,ror#n', it was found to negatively affect performance +# on Apple A7. The reason seems to be that it requires even 'y' to +# be available earlier. This means that such merged instruction is +# not necessarily best choice on critical path... 
On the other hand +# Cortex-A5x handles merged instructions much better than disjoint +# rotate and logical... See (**) footnote above. +$code.=<<___ if ($i<15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` + and $t1,$f,$e + bic $t2,$g,$e + add $h,$h,@X[$i&15] // h+=X[i] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) + ror $T0,$a,#$Sigma0[0] + add $h,$h,$t1 // h+=Ch(e,f,g) + eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` + add $h,$h,$t0 // h+=Sigma1(e) + and $t3,$t3,$t2 // (b^c)&=(a^b) + add $d,$d,$h // d+=h + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + //add $h,$h,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + ror $T1,@X[($j+1)&15],#$sigma0[0] + and $t1,$f,$e + ror $T2,@X[($j+14)&15],#$sigma1[0] + bic $t2,$g,$e + ror $T0,$a,#$Sigma0[0] + add $h,$h,@X[$i&15] // h+=X[i] + eor $t0,$t0,$e,ror#$Sigma1[1] + eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) + eor $T0,$T0,$a,ror#$Sigma0[1] + add $h,$h,$t1 // h+=Ch(e,f,g) + and $t3,$t3,$t2 // (b^c)&=(a^b) + eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] + eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) + add $h,$h,$t0 // h+=Sigma1(e) + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) + eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) + add @X[$j],@X[$j],@X[($j+9)&15] + add $d,$d,$h // d+=h + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + add @X[$j],@X[$j],$T1 + add $h,$h,$t1 // h+=Sigma0(a) + add @X[$j],@X[$j],$T2 +___ + ($t2,$t3)=($t3,$t2); +} + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#endif + +.text + +.extern OPENSSL_armcap_P +.globl $func +.type $func,%function +.align 6 +$func: +___ +$code.=<<___ if ($SZ==4); +#ifndef __KERNEL__ +# ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +# else + ldr x16,.LOPENSSL_armcap_P +# endif + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA256 + b.ne .Lv8_entry + tst w16,#ARMV7_NEON + b.ne .Lneon_entry +#endif +___ +$code.=<<___; + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*$SZ + + ldp $A,$B,[$ctx] // load context + ldp $C,$D,[$ctx,#2*$SZ] + ldp $E,$F,[$ctx,#4*$SZ] + add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input + ldp $G,$H,[$ctx,#6*$SZ] + adr $Ktbl,.LK$BITS + stp $ctx,$num,[x29,#96] + +.Loop: + ldp @X[0],@X[1],[$inp],#2*$SZ + ldr $t2,[$Ktbl],#$SZ // *K++ + eor $t3,$B,$C // magic seed + str $inp,[x29,#112] +___ +for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=".Loop_16_xx:\n"; +for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + cbnz $t2,.Loop_16_xx + + ldp $ctx,$num,[x29,#96] + ldr $inp,[x29,#112] + sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind + + ldp @X[0],@X[1],[$ctx] + ldp @X[2],@X[3],[$ctx,#2*$SZ] + add $inp,$inp,#14*$SZ // advance input pointer + ldp @X[4],@X[5],[$ctx,#4*$SZ] + add $A,$A,@X[0] + ldp @X[6],@X[7],[$ctx,#6*$SZ] + add $B,$B,@X[1] + add $C,$C,@X[2] + add $D,$D,@X[3] + stp $A,$B,[$ctx] + add $E,$E,@X[4] + add $F,$F,@X[5] + stp $C,$D,[$ctx,#2*$SZ] + add $G,$G,@X[6] + add $H,$H,@X[7] + cmp $inp,$num + stp $E,$F,[$ctx,#4*$SZ] + stp $G,$H,[$ctx,#6*$SZ] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*$SZ + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size $func,.-$func + +.align 6 +.type .LK$BITS,%object +.LK$BITS: +___ +$code.=<<___ if ($SZ==8); + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +___ +$code.=<<___ if ($SZ==4); + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +___ +$code.=<<___; +.size .LK$BITS,.-.LK$BITS +#ifndef __KERNEL__ +.align 3 +.LOPENSSL_armcap_P: +# ifdef __ILP32__ + .long OPENSSL_armcap_P-. +# else + .quad OPENSSL_armcap_P-. +# endif +#endif +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by " +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +#ifndef __KERNEL__ +.type sha256_block_armv8,%function +.align 6 +sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size sha256_block_armv8,.-sha256_block_armv8 +#endif +___ +} + +if ($SZ==4) { ######################################### NEON stuff # +# You'll surely note a lot of similarities with sha256-armv4 module, +# and of course it's not a coincidence. sha256-armv4 was used as +# initial template, but was adapted for ARMv8 instruction set and +# extensively re-tuned for all-round performance. 
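As an illustrative aside, not part of the patch: the scalar BODY_00_xx generator earlier in this file interleaves, per round, the SHA-256 compression step built from the rotation sets Sigma0=(2,13,22), Sigma1=(6,11,25), sigma0=(7,18,3), sigma1=(17,19,10) with the message-schedule update for a later round, while the sha256h/sha256h2/sha256su0/sha256su1 path above retires four rounds per instruction pair. One round spelled out in plain C; names with a trailing underscore are hypothetical helpers:

#include <stdint.h>

static inline uint32_t ror32_(uint32_t x, unsigned int n)
{
        return (x >> n) | (x << (32 - n));
}

/* One SHA-256 round over state s[8] with round constant k, using a
 * 16-entry circular message schedule w[], as the scalar code does. */
static void sha256_round_(uint32_t s[8], uint32_t w[16], unsigned int i,
                          uint32_t k)
{
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
        uint32_t S0, S1, ch, maj, t1, t2;

        if (i >= 16) {
                /* W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16] */
                uint32_t w15 = w[(i + 1) & 15], w2 = w[(i + 14) & 15];
                uint32_t s0 = ror32_(w15, 7) ^ ror32_(w15, 18) ^ (w15 >> 3);
                uint32_t s1 = ror32_(w2, 17) ^ ror32_(w2, 19) ^ (w2 >> 10);

                w[i & 15] += s0 + s1 + w[(i + 9) & 15];
        }

        S1  = ror32_(e, 6) ^ ror32_(e, 11) ^ ror32_(e, 25);
        S0  = ror32_(a, 2) ^ ror32_(a, 13) ^ ror32_(a, 22);
        ch  = (e & f) ^ (~e & g);           /* the and/bic/orr trio above  */
        maj = ((a ^ b) & (b ^ c)) ^ b;      /* the "(b^c)&=(a^b)" trick    */
        t1  = h + S1 + ch + k + w[i & 15];
        t2  = S0 + maj;

        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}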
+ +my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); +my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); +my $Ktbl="x16"; +my $Xfer="x17"; +my @X = map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } +sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } +sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + &ushr_32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] + eval(shift(@insns)); + &sli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T4,$T7,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T4,$T7,32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T5,$T7,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T7,$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_u32 ($T3,$T7,32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T6,@X[0],$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T7,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T6,@X[0],32-$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T5,@X[0],$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T6); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T5,@X[0],32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl], #16"); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T5); + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dhi($T5), &Dlo($T7)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + while($#insns>=1) { eval(shift(@insns)); } + &st1_32 ("{$T0}","[$Xfer], #16"); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub 
Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_8 ("{@X[0]}","[$inp],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &rev32 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &st1_32 ("{$T0}","[$Xfer], #16"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past + '&and ($t1,$f,$e)', + '&bic ($t4,$g,$e)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&orr ($t1,$t1,$t4)', # Ch(e,f,g) + '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ror ($t0,$t0,"#$Sigma1[0]")', + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t0)', # h+=Sigma1(e) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&ror ($t4,$t4,"#$Sigma0[0]")', + '&add ($d,$d,$h)', # d+=h + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#ifdef __KERNEL__ +.globl sha256_block_neon +#endif +.type sha256_block_neon,%function +.align 4 +sha256_block_neon: +.Lneon_entry: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr $Ktbl,.LK256 + add $num,$inp,$num,lsl#6 // len to point at the end of inp + + ld1.8 {@X[0]},[$inp], #16 + ld1.8 {@X[1]},[$inp], #16 + ld1.8 {@X[2]},[$inp], #16 + ld1.8 {@X[3]},[$inp], #16 + ld1.32 {$T0},[$Ktbl], #16 + ld1.32 {$T1},[$Ktbl], #16 + ld1.32 {$T2},[$Ktbl], #16 + ld1.32 {$T3},[$Ktbl], #16 + rev32 @X[0],@X[0] // yes, even on + rev32 @X[1],@X[1] // big-endian + rev32 @X[2],@X[2] + rev32 @X[3],@X[3] + mov $Xfer,sp + add.32 $T0,$T0,@X[0] + add.32 $T1,$T1,@X[1] + add.32 $T2,$T2,@X[2] + st1.32 {$T0-$T1},[$Xfer], #32 + add.32 $T3,$T3,@X[3] + st1.32 {$T2-$T3},[$Xfer] + sub $Xfer,$Xfer,#32 + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldp $E,$F,[$ctx,#16] + ldp $G,$H,[$ctx,#24] + ldr $t1,[sp,#0] + mov $t2,wzr + eor $t3,$B,$C + mov $t4,wzr + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + cmp $t1,#0 // check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + sub $Ktbl,$Ktbl,#256 // rewind $Ktbl + cmp $inp,$num + mov $Xfer, #64 + csel $Xfer, $Xfer, xzr, eq + sub $inp,$inp,$Xfer // avoid SEGV + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + add $A,$A,$t4 // h+=Sigma0(a) from the past + ldp $t0,$t1,[$ctx,#0] + add $A,$A,$t2 // h+=Maj(a,b,c) from the past + ldp $t2,$t3,[$ctx,#8] + add $A,$A,$t0 // accumulate + add $B,$B,$t1 + ldp $t0,$t1,[$ctx,#16] + add $C,$C,$t2 + add $D,$D,$t3 + ldp $t2,$t3,[$ctx,#24] + add $E,$E,$t0 + add $F,$F,$t1 + ldr $t1,[sp,#0] + stp $A,$B,[$ctx,#0] + add $G,$G,$t2 + mov $t2,wzr + stp $C,$D,[$ctx,#8] + add $H,$H,$t3 + stp $E,$F,[$ctx,#16] + eor $t3,$B,$C + stp $G,$H,[$ctx,#24] + mov $t4,wzr + mov $Xfer,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size sha256_block_neon,.-sha256_block_neon +___ +} + +$code.=<<___; +#ifndef __KERNEL__ +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers + + s/\.[ui]?8(\s)/$1/; + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; + + print $_,"\n"; +} + +close STDOUT; diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S new file mode 100644 index 000000000000..f3e21c6d87d2 --- /dev/null +++ b/lib/crypto/arm64/sha256-ce.S @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd + */ + +#include +#include + + .text + .arch armv8-a+crypto + + dga .req q20 + dgav .req v20 + dgb .req q21 + dgbv .req v21 + + t0 .req v22 + t1 .req v23 + + dg0q .req q24 + dg0v .req v24 + dg1q .req q25 + dg1v .req v25 + dg2q .req q26 + dg2v .req v26 + + .macro add_only, ev, rc, s0 + mov dg2v.16b, dg0v.16b + .ifeq \ev + add t1.4s, v\s0\().4s, \rc\().4s + sha256h dg0q, dg1q, t0.4s + sha256h2 
dg1q, dg2q, t0.4s + .else + .ifnb \s0 + add t0.4s, v\s0\().4s, \rc\().4s + .endif + sha256h dg0q, dg1q, t1.4s + sha256h2 dg1q, dg2q, t1.4s + .endif + .endm + + .macro add_update, ev, rc, s0, s1, s2, s3 + sha256su0 v\s0\().4s, v\s1\().4s + add_only \ev, \rc, \s1 + sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s + .endm + + /* + * The SHA-256 round constants + */ + .section ".rodata", "a" + .align 4 +.Lsha2_rcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + + /* + * size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS], + * const u8 *data, size_t nblocks); + */ + .text +SYM_FUNC_START(__sha256_ce_transform) + /* load round constants */ + adr_l x8, .Lsha2_rcon + ld1 { v0.4s- v3.4s}, [x8], #64 + ld1 { v4.4s- v7.4s}, [x8], #64 + ld1 { v8.4s-v11.4s}, [x8], #64 + ld1 {v12.4s-v15.4s}, [x8] + + /* load state */ + ld1 {dgav.4s, dgbv.4s}, [x0] + + /* load input */ +0: ld1 {v16.4s-v19.4s}, [x1], #64 + sub x2, x2, #1 + +CPU_LE( rev32 v16.16b, v16.16b ) +CPU_LE( rev32 v17.16b, v17.16b ) +CPU_LE( rev32 v18.16b, v18.16b ) +CPU_LE( rev32 v19.16b, v19.16b ) + + add t0.4s, v16.4s, v0.4s + mov dg0v.16b, dgav.16b + mov dg1v.16b, dgbv.16b + + add_update 0, v1, 16, 17, 18, 19 + add_update 1, v2, 17, 18, 19, 16 + add_update 0, v3, 18, 19, 16, 17 + add_update 1, v4, 19, 16, 17, 18 + + add_update 0, v5, 16, 17, 18, 19 + add_update 1, v6, 17, 18, 19, 16 + add_update 0, v7, 18, 19, 16, 17 + add_update 1, v8, 19, 16, 17, 18 + + add_update 0, v9, 16, 17, 18, 19 + add_update 1, v10, 17, 18, 19, 16 + add_update 0, v11, 18, 19, 16, 17 + add_update 1, v12, 19, 16, 17, 18 + + add_only 0, v13, 17 + add_only 1, v14, 18 + add_only 0, v15, 19 + add_only 1 + + /* update state */ + add dgav.4s, dgav.4s, dg0v.4s + add dgbv.4s, dgbv.4s, dg1v.4s + + /* return early if voluntary preemption is needed */ + cond_yield 1f, x5, x6 + + /* handled all input blocks? 
*/ + cbnz x2, 0b + + /* store new state */ +1: st1 {dgav.4s, dgbv.4s}, [x0] + mov x0, x2 + ret +SYM_FUNC_END(__sha256_ce_transform) diff --git a/lib/crypto/arm64/sha256.c b/lib/crypto/arm64/sha256.c new file mode 100644 index 000000000000..bcf7a3adc0c4 --- /dev/null +++ b/lib/crypto/arm64/sha256.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-256 optimized for ARM64 + * + * Copyright 2025 Google LLC + */ +#include +#include +#include +#include + +asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); +EXPORT_SYMBOL_GPL(sha256_blocks_arch); +asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); +asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon)) { + if (static_branch_likely(&have_ce)) { + do { + size_t rem; + + kernel_neon_begin(); + rem = __sha256_ce_transform(state, + data, nblocks); + kernel_neon_end(); + data += (nblocks - rem) * SHA256_BLOCK_SIZE; + nblocks = rem; + } while (nblocks); + } else { + kernel_neon_begin(); + sha256_block_neon(state, data, nblocks); + kernel_neon_end(); + } + } else { + sha256_blocks_arch(state, data, nblocks); + } +} +EXPORT_SYMBOL_GPL(sha256_blocks_simd); + +bool sha256_is_arch_optimized(void) +{ + /* We always can use at least the ARM64 scalar implementation. */ + return true; +} +EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); + +static int __init sha256_arm64_mod_init(void) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + cpu_have_named_feature(ASIMD)) { + static_branch_enable(&have_neon); + if (cpu_have_named_feature(SHA2)) + static_branch_enable(&have_ce); + } + return 0; +} +subsys_initcall(sha256_arm64_mod_init); + +static void __exit sha256_arm64_mod_exit(void) +{ +} +module_exit(sha256_arm64_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-256 optimized for ARM64"); -- cgit v1.2.3 From 7e54e993ab8c98d912f54ad6f46bfcc9dcd65368 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 19 Jun 2025 12:19:02 -0700 Subject: lib/crypto: mips: Move arch/mips/lib/crypto/ into lib/crypto/ Move the contents of arch/mips/lib/crypto/ into lib/crypto/mips/. The new code organization makes a lot more sense for how this code actually works and is developed. In particular, it makes it possible to build each algorithm as a single module, with better inlining and dead code elimination. For a more detailed explanation, see the patchset which did this for the CRC library code: https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/. Also see the patchset which did this for SHA-512: https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/ This is just a preparatory commit, which does the move to get the files into their new location but keeps them building the same way as before. Later commits will make the actual improvements to the way the arch-optimized code is integrated for each algorithm. Add a gitignore entry for the removed directory arch/mips/lib/crypto/ so that people don't accidentally commit leftover generated files. Acked-by: Ard Biesheuvel Reviewed-by: Martin K. 
Petersen Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/r/20250619191908.134235-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/mips/lib/.gitignore | 4 + arch/mips/lib/Makefile | 2 - arch/mips/lib/crypto/.gitignore | 2 - arch/mips/lib/crypto/Kconfig | 12 - arch/mips/lib/crypto/Makefile | 19 - arch/mips/lib/crypto/chacha-core.S | 497 ------------- arch/mips/lib/crypto/chacha-glue.c | 29 - arch/mips/lib/crypto/poly1305-glue.c | 33 - arch/mips/lib/crypto/poly1305-mips.pl | 1273 --------------------------------- lib/crypto/Kconfig | 2 +- lib/crypto/Makefile | 1 + lib/crypto/mips/.gitignore | 2 + lib/crypto/mips/Kconfig | 12 + lib/crypto/mips/Makefile | 19 + lib/crypto/mips/chacha-core.S | 497 +++++++++++++ lib/crypto/mips/chacha-glue.c | 29 + lib/crypto/mips/poly1305-glue.c | 33 + lib/crypto/mips/poly1305-mips.pl | 1273 +++++++++++++++++++++++++++++++++ 18 files changed, 1871 insertions(+), 1868 deletions(-) create mode 100644 arch/mips/lib/.gitignore delete mode 100644 arch/mips/lib/crypto/.gitignore delete mode 100644 arch/mips/lib/crypto/Kconfig delete mode 100644 arch/mips/lib/crypto/Makefile delete mode 100644 arch/mips/lib/crypto/chacha-core.S delete mode 100644 arch/mips/lib/crypto/chacha-glue.c delete mode 100644 arch/mips/lib/crypto/poly1305-glue.c delete mode 100644 arch/mips/lib/crypto/poly1305-mips.pl create mode 100644 lib/crypto/mips/.gitignore create mode 100644 lib/crypto/mips/Kconfig create mode 100644 lib/crypto/mips/Makefile create mode 100644 lib/crypto/mips/chacha-core.S create mode 100644 lib/crypto/mips/chacha-glue.c create mode 100644 lib/crypto/mips/poly1305-glue.c create mode 100644 lib/crypto/mips/poly1305-mips.pl diff --git a/arch/mips/lib/.gitignore b/arch/mips/lib/.gitignore new file mode 100644 index 000000000000..647d7a922e68 --- /dev/null +++ b/arch/mips/lib/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +# This now-removed directory used to contain generated files. +/crypto/ diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile index 9d75845ef78e..9c024e6d5e54 100644 --- a/arch/mips/lib/Makefile +++ b/arch/mips/lib/Makefile @@ -3,8 +3,6 @@ # Makefile for MIPS-specific library files.. 
# -obj-y += crypto/ - lib-y += bitops.o csum_partial.o delay.o memcpy.o memset.o \ mips-atomic.o strncpy_user.o \ strnlen_user.o uncached.o diff --git a/arch/mips/lib/crypto/.gitignore b/arch/mips/lib/crypto/.gitignore deleted file mode 100644 index 0d47d4f21c6d..000000000000 --- a/arch/mips/lib/crypto/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -poly1305-core.S diff --git a/arch/mips/lib/crypto/Kconfig b/arch/mips/lib/crypto/Kconfig deleted file mode 100644 index 0670a170c1be..000000000000 --- a/arch/mips/lib/crypto/Kconfig +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_MIPS - tristate - depends on CPU_MIPS32_R2 - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_MIPS - tristate - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/arch/mips/lib/crypto/Makefile b/arch/mips/lib/crypto/Makefile deleted file mode 100644 index 804488c7aded..000000000000 --- a/arch/mips/lib/crypto/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o -chacha-mips-y := chacha-core.o chacha-glue.o -AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots - -obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o -poly1305-mips-y := poly1305-core.o poly1305-glue.o - -perlasm-flavour-$(CONFIG_32BIT) := o32 -perlasm-flavour-$(CONFIG_64BIT) := 64 - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) - -$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE - $(call if_changed,perlasm) - -targets += poly1305-core.S diff --git a/arch/mips/lib/crypto/chacha-core.S b/arch/mips/lib/crypto/chacha-core.S deleted file mode 100644 index 5755f69cfe00..000000000000 --- a/arch/mips/lib/crypto/chacha-core.S +++ /dev/null @@ -1,497 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2016-2018 René van Dorst . All Rights Reserved. - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - */ - -#define MASK_U32 0x3c -#define CHACHA20_BLOCK_SIZE 64 -#define STACK_SIZE 32 - -#define X0 $t0 -#define X1 $t1 -#define X2 $t2 -#define X3 $t3 -#define X4 $t4 -#define X5 $t5 -#define X6 $t6 -#define X7 $t7 -#define X8 $t8 -#define X9 $t9 -#define X10 $v1 -#define X11 $s6 -#define X12 $s5 -#define X13 $s4 -#define X14 $s3 -#define X15 $s2 -/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ -#define T0 $s1 -#define T1 $s0 -#define T(n) T ## n -#define X(n) X ## n - -/* Input arguments */ -#define STATE $a0 -#define OUT $a1 -#define IN $a2 -#define BYTES $a3 - -/* Output argument */ -/* NONCE[0] is kept in a register and not in memory. - * We don't want to touch original value in memory. - * Must be incremented every loop iteration. - */ -#define NONCE_0 $v0 - -/* SAVED_X and SAVED_CA are set in the jump table. - * Use regs which are overwritten on exit else we don't leak clear data. - * They are used to handling the last bytes which are not multiple of 4. 
- */ -#define SAVED_X X15 -#define SAVED_CA $s7 - -#define IS_UNALIGNED $s7 - -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#define MSB 0 -#define LSB 3 -#define ROTx rotl -#define ROTR(n) rotr n, 24 -#define CPU_TO_LE32(n) \ - wsbh n; \ - rotr n, 16; -#else -#define MSB 3 -#define LSB 0 -#define ROTx rotr -#define CPU_TO_LE32(n) -#define ROTR(n) -#endif - -#define FOR_EACH_WORD(x) \ - x( 0); \ - x( 1); \ - x( 2); \ - x( 3); \ - x( 4); \ - x( 5); \ - x( 6); \ - x( 7); \ - x( 8); \ - x( 9); \ - x(10); \ - x(11); \ - x(12); \ - x(13); \ - x(14); \ - x(15); - -#define FOR_EACH_WORD_REV(x) \ - x(15); \ - x(14); \ - x(13); \ - x(12); \ - x(11); \ - x(10); \ - x( 9); \ - x( 8); \ - x( 7); \ - x( 6); \ - x( 5); \ - x( 4); \ - x( 3); \ - x( 2); \ - x( 1); \ - x( 0); - -#define PLUS_ONE_0 1 -#define PLUS_ONE_1 2 -#define PLUS_ONE_2 3 -#define PLUS_ONE_3 4 -#define PLUS_ONE_4 5 -#define PLUS_ONE_5 6 -#define PLUS_ONE_6 7 -#define PLUS_ONE_7 8 -#define PLUS_ONE_8 9 -#define PLUS_ONE_9 10 -#define PLUS_ONE_10 11 -#define PLUS_ONE_11 12 -#define PLUS_ONE_12 13 -#define PLUS_ONE_13 14 -#define PLUS_ONE_14 15 -#define PLUS_ONE_15 16 -#define PLUS_ONE(x) PLUS_ONE_ ## x -#define _CONCAT3(a,b,c) a ## b ## c -#define CONCAT3(a,b,c) _CONCAT3(a,b,c) - -#define STORE_UNALIGNED(x) \ -CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ - .if (x != 12); \ - lw T0, (x*4)(STATE); \ - .endif; \ - lwl T1, (x*4)+MSB ## (IN); \ - lwr T1, (x*4)+LSB ## (IN); \ - .if (x == 12); \ - addu X ## x, NONCE_0; \ - .else; \ - addu X ## x, T0; \ - .endif; \ - CPU_TO_LE32(X ## x); \ - xor X ## x, T1; \ - swl X ## x, (x*4)+MSB ## (OUT); \ - swr X ## x, (x*4)+LSB ## (OUT); - -#define STORE_ALIGNED(x) \ -CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ - .if (x != 12); \ - lw T0, (x*4)(STATE); \ - .endif; \ - lw T1, (x*4) ## (IN); \ - .if (x == 12); \ - addu X ## x, NONCE_0; \ - .else; \ - addu X ## x, T0; \ - .endif; \ - CPU_TO_LE32(X ## x); \ - xor X ## x, T1; \ - sw X ## x, (x*4) ## (OUT); - -/* Jump table macro. - * Used for setup and handling the last bytes, which are not multiple of 4. - * X15 is free to store Xn - * Every jumptable entry must be equal in size. - */ -#define JMPTBL_ALIGNED(x) \ -.Lchacha_mips_jmptbl_aligned_ ## x: ; \ - .set noreorder; \ - b .Lchacha_mips_xor_aligned_ ## x ## _b; \ - .if (x == 12); \ - addu SAVED_X, X ## x, NONCE_0; \ - .else; \ - addu SAVED_X, X ## x, SAVED_CA; \ - .endif; \ - .set reorder - -#define JMPTBL_UNALIGNED(x) \ -.Lchacha_mips_jmptbl_unaligned_ ## x: ; \ - .set noreorder; \ - b .Lchacha_mips_xor_unaligned_ ## x ## _b; \ - .if (x == 12); \ - addu SAVED_X, X ## x, NONCE_0; \ - .else; \ - addu SAVED_X, X ## x, SAVED_CA; \ - .endif; \ - .set reorder - -#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ - addu X(A), X(K); \ - addu X(B), X(L); \ - addu X(C), X(M); \ - addu X(D), X(N); \ - xor X(V), X(A); \ - xor X(W), X(B); \ - xor X(Y), X(C); \ - xor X(Z), X(D); \ - rotl X(V), S; \ - rotl X(W), S; \ - rotl X(Y), S; \ - rotl X(Z), S; - -.text -.set reorder -.set noat -.globl chacha_crypt_arch -.ent chacha_crypt_arch -chacha_crypt_arch: - .frame $sp, STACK_SIZE, $ra - - /* Load number of rounds */ - lw $at, 16($sp) - - addiu $sp, -STACK_SIZE - - /* Return bytes = 0. */ - beqz BYTES, .Lchacha_mips_end - - lw NONCE_0, 48(STATE) - - /* Save s0-s7 */ - sw $s0, 0($sp) - sw $s1, 4($sp) - sw $s2, 8($sp) - sw $s3, 12($sp) - sw $s4, 16($sp) - sw $s5, 20($sp) - sw $s6, 24($sp) - sw $s7, 28($sp) - - /* Test IN or OUT is unaligned. 
- * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 - */ - or IS_UNALIGNED, IN, OUT - andi IS_UNALIGNED, 0x3 - - b .Lchacha_rounds_start - -.align 4 -.Loop_chacha_rounds: - addiu IN, CHACHA20_BLOCK_SIZE - addiu OUT, CHACHA20_BLOCK_SIZE - addiu NONCE_0, 1 - -.Lchacha_rounds_start: - lw X0, 0(STATE) - lw X1, 4(STATE) - lw X2, 8(STATE) - lw X3, 12(STATE) - - lw X4, 16(STATE) - lw X5, 20(STATE) - lw X6, 24(STATE) - lw X7, 28(STATE) - lw X8, 32(STATE) - lw X9, 36(STATE) - lw X10, 40(STATE) - lw X11, 44(STATE) - - move X12, NONCE_0 - lw X13, 52(STATE) - lw X14, 56(STATE) - lw X15, 60(STATE) - -.Loop_chacha_xor_rounds: - addiu $at, -2 - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); - bnez $at, .Loop_chacha_xor_rounds - - addiu BYTES, -(CHACHA20_BLOCK_SIZE) - - /* Is data src/dst unaligned? Jump */ - bnez IS_UNALIGNED, .Loop_chacha_unaligned - - /* Set number rounds here to fill delayslot. */ - lw $at, (STACK_SIZE+16)($sp) - - /* BYTES < 0, it has no full block. */ - bltz BYTES, .Lchacha_mips_no_full_block_aligned - - FOR_EACH_WORD_REV(STORE_ALIGNED) - - /* BYTES > 0? Loop again. */ - bgtz BYTES, .Loop_chacha_rounds - - /* Place this here to fill delay slot */ - addiu NONCE_0, 1 - - /* BYTES < 0? Handle last bytes */ - bltz BYTES, .Lchacha_mips_xor_bytes - -.Lchacha_mips_xor_done: - /* Restore used registers */ - lw $s0, 0($sp) - lw $s1, 4($sp) - lw $s2, 8($sp) - lw $s3, 12($sp) - lw $s4, 16($sp) - lw $s5, 20($sp) - lw $s6, 24($sp) - lw $s7, 28($sp) - - /* Write NONCE_0 back to right location in state */ - sw NONCE_0, 48(STATE) - -.Lchacha_mips_end: - addiu $sp, STACK_SIZE - jr $ra - -.Lchacha_mips_no_full_block_aligned: - /* Restore the offset on BYTES */ - addiu BYTES, CHACHA20_BLOCK_SIZE - - /* Get number of full WORDS */ - andi $at, BYTES, MASK_U32 - - /* Load upper half of jump table addr */ - lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0) - - /* Calculate lower half jump table offset */ - ins T0, $at, 1, 6 - - /* Add offset to STATE */ - addu T1, STATE, $at - - /* Add lower half jump table addr */ - addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0) - - /* Read value from STATE */ - lw SAVED_CA, 0(T1) - - /* Store remaining bytecounter as negative value */ - subu BYTES, $at, BYTES - - jr T0 - - /* Jump table */ - FOR_EACH_WORD(JMPTBL_ALIGNED) - - -.Loop_chacha_unaligned: - /* Set number rounds here to fill delayslot. */ - lw $at, (STACK_SIZE+16)($sp) - - /* BYTES > 0, it has no full block. */ - bltz BYTES, .Lchacha_mips_no_full_block_unaligned - - FOR_EACH_WORD_REV(STORE_UNALIGNED) - - /* BYTES > 0? Loop again. 
*/ - bgtz BYTES, .Loop_chacha_rounds - - /* Write NONCE_0 back to right location in state */ - sw NONCE_0, 48(STATE) - - .set noreorder - /* Fall through to byte handling */ - bgez BYTES, .Lchacha_mips_xor_done -.Lchacha_mips_xor_unaligned_0_b: -.Lchacha_mips_xor_aligned_0_b: - /* Place this here to fill delay slot */ - addiu NONCE_0, 1 - .set reorder - -.Lchacha_mips_xor_bytes: - addu IN, $at - addu OUT, $at - /* First byte */ - lbu T1, 0(IN) - addiu $at, BYTES, 1 - CPU_TO_LE32(SAVED_X) - ROTR(SAVED_X) - xor T1, SAVED_X - sb T1, 0(OUT) - beqz $at, .Lchacha_mips_xor_done - /* Second byte */ - lbu T1, 1(IN) - addiu $at, BYTES, 2 - ROTx SAVED_X, 8 - xor T1, SAVED_X - sb T1, 1(OUT) - beqz $at, .Lchacha_mips_xor_done - /* Third byte */ - lbu T1, 2(IN) - ROTx SAVED_X, 8 - xor T1, SAVED_X - sb T1, 2(OUT) - b .Lchacha_mips_xor_done - -.Lchacha_mips_no_full_block_unaligned: - /* Restore the offset on BYTES */ - addiu BYTES, CHACHA20_BLOCK_SIZE - - /* Get number of full WORDS */ - andi $at, BYTES, MASK_U32 - - /* Load upper half of jump table addr */ - lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0) - - /* Calculate lower half jump table offset */ - ins T0, $at, 1, 6 - - /* Add offset to STATE */ - addu T1, STATE, $at - - /* Add lower half jump table addr */ - addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0) - - /* Read value from STATE */ - lw SAVED_CA, 0(T1) - - /* Store remaining bytecounter as negative value */ - subu BYTES, $at, BYTES - - jr T0 - - /* Jump table */ - FOR_EACH_WORD(JMPTBL_UNALIGNED) -.end chacha_crypt_arch -.set at - -/* Input arguments - * STATE $a0 - * OUT $a1 - * NROUND $a2 - */ - -#undef X12 -#undef X13 -#undef X14 -#undef X15 - -#define X12 $a3 -#define X13 $at -#define X14 $v0 -#define X15 STATE - -.set noat -.globl hchacha_block_arch -.ent hchacha_block_arch -hchacha_block_arch: - .frame $sp, STACK_SIZE, $ra - - addiu $sp, -STACK_SIZE - - /* Save X11(s6) */ - sw X11, 0($sp) - - lw X0, 0(STATE) - lw X1, 4(STATE) - lw X2, 8(STATE) - lw X3, 12(STATE) - lw X4, 16(STATE) - lw X5, 20(STATE) - lw X6, 24(STATE) - lw X7, 28(STATE) - lw X8, 32(STATE) - lw X9, 36(STATE) - lw X10, 40(STATE) - lw X11, 44(STATE) - lw X12, 48(STATE) - lw X13, 52(STATE) - lw X14, 56(STATE) - lw X15, 60(STATE) - -.Loop_hchacha_xor_rounds: - addiu $a2, -2 - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); - bnez $a2, .Loop_hchacha_xor_rounds - - /* Restore used register */ - lw X11, 0($sp) - - sw X0, 0(OUT) - sw X1, 4(OUT) - sw X2, 8(OUT) - sw X3, 12(OUT) - sw X12, 16(OUT) - sw X13, 20(OUT) - sw X14, 24(OUT) - sw X15, 28(OUT) - - addiu $sp, STACK_SIZE - jr $ra -.end hchacha_block_arch -.set at diff --git a/arch/mips/lib/crypto/chacha-glue.c b/arch/mips/lib/crypto/chacha-glue.c deleted file mode 100644 index 88c097594eb0..000000000000 --- a/arch/mips/lib/crypto/chacha-glue.c +++ /dev/null @@ -1,29 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ChaCha and HChaCha functions (MIPS optimized) - * - * Copyright (C) 2019 Linaro, Ltd. 
- */ - -#include -#include -#include - -asmlinkage void chacha_crypt_arch(struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int bytes, int nrounds); -EXPORT_SYMBOL(chacha_crypt_arch); - -asmlinkage void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); -EXPORT_SYMBOL(hchacha_block_arch); - -bool chacha_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/arch/mips/lib/crypto/poly1305-glue.c b/arch/mips/lib/crypto/poly1305-glue.c deleted file mode 100644 index 764a38a65200..000000000000 --- a/arch/mips/lib/crypto/poly1305-glue.c +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS - * - * Copyright (C) 2019 Linaro Ltd. - */ - -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -bool poly1305_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/mips/lib/crypto/poly1305-mips.pl b/arch/mips/lib/crypto/poly1305-mips.pl deleted file mode 100644 index 399f10c3e385..000000000000 --- a/arch/mips/lib/crypto/poly1305-mips.pl +++ /dev/null @@ -1,1273 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL -# project. -# ==================================================================== - -# Poly1305 hash for MIPS. -# -# May 2016 -# -# Numbers are cycles per processed byte with poly1305_blocks alone. -# -# IALU/gcc -# R1x000 ~5.5/+130% (big-endian) -# Octeon II 2.50/+70% (little-endian) -# -# March 2019 -# -# Add 32-bit code path. -# -# October 2019 -# -# Modulo-scheduling reduction allows to omit dependency chain at the -# end of inner loop and improve performance. Also optimize MIPS32R2 -# code path for MIPS 1004K core. Per René von Dorst's suggestions. -# -# IALU/gcc -# R1x000 ~9.8/? (big-endian) -# Octeon II 3.65/+140% (little-endian) -# MT7621/1004K 4.75/? (little-endian) -# -###################################################################### -# There is a number of MIPS ABI in use, O32 and N32/64 are most -# widely used. Then there is a new contender: NUBI. It appears that if -# one picks the latter, it's possible to arrange code in ABI neutral -# manner. Therefore let's stick to NUBI register layout: -# -($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); -($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); -($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); -($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); -# -# The return value is placed in $a0. 
Following coding rules facilitate -# interoperability: -# -# - never ever touch $tp, "thread pointer", former $gp [o32 can be -# excluded from the rule, because it's specified volatile]; -# - copy return value to $t0, former $v0 [or to $a0 if you're adapting -# old code]; -# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; -# -# For reference here is register layout for N32/64 MIPS ABIs: -# -# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); -# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); -# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); -# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); -# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); -# -# -# -###################################################################### - -$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 - -$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; - -if ($flavour =~ /64|n32/i) {{{ -###################################################################### -# 64-bit code path -# - -my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); -my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); - -$code.=<<___; -#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ - defined(_MIPS_ARCH_MIPS64R6)) \\ - && !defined(_MIPS_ARCH_MIPS64R2) -# define _MIPS_ARCH_MIPS64R2 -#endif - -#if defined(_MIPS_ARCH_MIPS64R6) -# define dmultu(rs,rt) -# define mflo(rd,rs,rt) dmulu rd,rs,rt -# define mfhi(rd,rs,rt) dmuhu rd,rs,rt -#else -# define dmultu(rs,rt) dmultu rs,rt -# define mflo(rd,rs,rt) mflo rd -# define mfhi(rd,rs,rt) mfhi rd -#endif - -#ifdef __KERNEL__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arch -# define poly1305_emit poly1305_emit_arch -#endif - -#if defined(__MIPSEB__) && !defined(MIPSEB) -# define MIPSEB -#endif - -#ifdef MIPSEB -# define MSB 0 -# define LSB 7 -#else -# define MSB 7 -# define LSB 0 -#endif - -.text -.set noat -.set noreorder - -.align 5 -.globl poly1305_init -.ent poly1305_init -poly1305_init: - .frame $sp,0,$ra - .set reorder - - sd $zero,0($ctx) - sd $zero,8($ctx) - sd $zero,16($ctx) - - beqz $inp,.Lno_key - -#if defined(_MIPS_ARCH_MIPS64R6) - andi $tmp0,$inp,7 # $inp % 8 - dsubu $inp,$inp,$tmp0 # align $inp - sll $tmp0,$tmp0,3 # byte to bit offset - ld $in0,0($inp) - ld $in1,8($inp) - beqz $tmp0,.Laligned_key - ld $tmp2,16($inp) - - subu $tmp1,$zero,$tmp0 -# ifdef MIPSEB - dsllv $in0,$in0,$tmp0 - dsrlv $tmp3,$in1,$tmp1 - dsllv $in1,$in1,$tmp0 - dsrlv $tmp2,$tmp2,$tmp1 -# else - dsrlv $in0,$in0,$tmp0 - dsllv $tmp3,$in1,$tmp1 - dsrlv $in1,$in1,$tmp0 - dsllv $tmp2,$tmp2,$tmp1 -# endif - or $in0,$in0,$tmp3 - or $in1,$in1,$tmp2 -.Laligned_key: -#else - ldl $in0,0+MSB($inp) - ldl $in1,8+MSB($inp) - ldr $in0,0+LSB($inp) - ldr $in1,8+LSB($inp) -#endif -#ifdef MIPSEB -# if defined(_MIPS_ARCH_MIPS64R2) - dsbh $in0,$in0 # byte swap - dsbh $in1,$in1 - dshd $in0,$in0 - dshd $in1,$in1 -# else - ori $tmp0,$zero,0xFF - dsll $tmp2,$tmp0,32 - or $tmp0,$tmp2 # 0x000000FF000000FF - - and $tmp1,$in0,$tmp0 # byte swap - and $tmp3,$in1,$tmp0 - dsrl $tmp2,$in0,24 - dsrl $tmp4,$in1,24 - dsll $tmp1,24 - dsll $tmp3,24 - and $tmp2,$tmp0 - and $tmp4,$tmp0 - dsll $tmp0,8 # 0x0000FF000000FF00 - or $tmp1,$tmp2 - or $tmp3,$tmp4 - and $tmp2,$in0,$tmp0 - and $tmp4,$in1,$tmp0 - dsrl $in0,8 - dsrl $in1,8 - dsll $tmp2,8 - dsll $tmp4,8 - and $in0,$tmp0 - and $in1,$tmp0 - or $tmp1,$tmp2 - or $tmp3,$tmp4 - or $in0,$tmp1 - or $in1,$tmp3 - dsrl $tmp1,$in0,32 - dsrl $tmp3,$in1,32 - dsll $in0,32 - dsll $in1,32 - or $in0,$tmp1 - 
or $in1,$tmp3 -# endif -#endif - li $tmp0,1 - dsll $tmp0,32 # 0x0000000100000000 - daddiu $tmp0,-63 # 0x00000000ffffffc1 - dsll $tmp0,28 # 0x0ffffffc10000000 - daddiu $tmp0,-1 # 0x0ffffffc0fffffff - - and $in0,$tmp0 - daddiu $tmp0,-3 # 0x0ffffffc0ffffffc - and $in1,$tmp0 - - sd $in0,24($ctx) - dsrl $tmp0,$in1,2 - sd $in1,32($ctx) - daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) - sd $tmp0,40($ctx) - -.Lno_key: - li $v0,0 # return 0 - jr $ra -.end poly1305_init -___ -{ -my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; - -my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = - ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); -my ($shr,$shl) = ($s6,$s7); # used on R6 - -$code.=<<___; -.align 5 -.globl poly1305_blocks -.ent poly1305_blocks -poly1305_blocks: - .set noreorder - dsrl $len,4 # number of complete blocks - bnez $len,poly1305_blocks_internal - nop - jr $ra - nop -.end poly1305_blocks - -.align 5 -.ent poly1305_blocks_internal -poly1305_blocks_internal: - .set noreorder -#if defined(_MIPS_ARCH_MIPS64R6) - .frame $sp,8*8,$ra - .mask $SAVED_REGS_MASK|0x000c0000,-8 - dsubu $sp,8*8 - sd $s7,56($sp) - sd $s6,48($sp) -#else - .frame $sp,6*8,$ra - .mask $SAVED_REGS_MASK,-8 - dsubu $sp,6*8 -#endif - sd $s5,40($sp) - sd $s4,32($sp) -___ -$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue - sd $s3,24($sp) - sd $s2,16($sp) - sd $s1,8($sp) - sd $s0,0($sp) -___ -$code.=<<___; - .set reorder - -#if defined(_MIPS_ARCH_MIPS64R6) - andi $shr,$inp,7 - dsubu $inp,$inp,$shr # align $inp - sll $shr,$shr,3 # byte to bit offset - subu $shl,$zero,$shr -#endif - - ld $h0,0($ctx) # load hash value - ld $h1,8($ctx) - ld $h2,16($ctx) - - ld $r0,24($ctx) # load key - ld $r1,32($ctx) - ld $rs1,40($ctx) - - dsll $len,4 - daddu $len,$inp # end of buffer - b .Loop - -.align 4 -.Loop: -#if defined(_MIPS_ARCH_MIPS64R6) - ld $in0,0($inp) # load input - ld $in1,8($inp) - beqz $shr,.Laligned_inp - - ld $tmp2,16($inp) -# ifdef MIPSEB - dsllv $in0,$in0,$shr - dsrlv $tmp3,$in1,$shl - dsllv $in1,$in1,$shr - dsrlv $tmp2,$tmp2,$shl -# else - dsrlv $in0,$in0,$shr - dsllv $tmp3,$in1,$shl - dsrlv $in1,$in1,$shr - dsllv $tmp2,$tmp2,$shl -# endif - or $in0,$in0,$tmp3 - or $in1,$in1,$tmp2 -.Laligned_inp: -#else - ldl $in0,0+MSB($inp) # load input - ldl $in1,8+MSB($inp) - ldr $in0,0+LSB($inp) - ldr $in1,8+LSB($inp) -#endif - daddiu $inp,16 -#ifdef MIPSEB -# if defined(_MIPS_ARCH_MIPS64R2) - dsbh $in0,$in0 # byte swap - dsbh $in1,$in1 - dshd $in0,$in0 - dshd $in1,$in1 -# else - ori $tmp0,$zero,0xFF - dsll $tmp2,$tmp0,32 - or $tmp0,$tmp2 # 0x000000FF000000FF - - and $tmp1,$in0,$tmp0 # byte swap - and $tmp3,$in1,$tmp0 - dsrl $tmp2,$in0,24 - dsrl $tmp4,$in1,24 - dsll $tmp1,24 - dsll $tmp3,24 - and $tmp2,$tmp0 - and $tmp4,$tmp0 - dsll $tmp0,8 # 0x0000FF000000FF00 - or $tmp1,$tmp2 - or $tmp3,$tmp4 - and $tmp2,$in0,$tmp0 - and $tmp4,$in1,$tmp0 - dsrl $in0,8 - dsrl $in1,8 - dsll $tmp2,8 - dsll $tmp4,8 - and $in0,$tmp0 - and $in1,$tmp0 - or $tmp1,$tmp2 - or $tmp3,$tmp4 - or $in0,$tmp1 - or $in1,$tmp3 - dsrl $tmp1,$in0,32 - dsrl $tmp3,$in1,32 - dsll $in0,32 - dsll $in1,32 - or $in0,$tmp1 - or $in1,$tmp3 -# endif -#endif - dsrl $tmp1,$h2,2 # modulo-scheduled reduction - andi $h2,$h2,3 - dsll $tmp0,$tmp1,2 - - daddu $d0,$h0,$in0 # accumulate input - daddu $tmp1,$tmp0 - sltu $tmp0,$d0,$h0 - daddu $d0,$d0,$tmp1 # ... 
and residue - sltu $tmp1,$d0,$tmp1 - daddu $d1,$h1,$in1 - daddu $tmp0,$tmp1 - sltu $tmp1,$d1,$h1 - daddu $d1,$tmp0 - - dmultu ($r0,$d0) # h0*r0 - daddu $d2,$h2,$padbit - sltu $tmp0,$d1,$tmp0 - mflo ($h0,$r0,$d0) - mfhi ($h1,$r0,$d0) - - dmultu ($rs1,$d1) # h1*5*r1 - daddu $d2,$tmp1 - daddu $d2,$tmp0 - mflo ($tmp0,$rs1,$d1) - mfhi ($tmp1,$rs1,$d1) - - dmultu ($r1,$d0) # h0*r1 - mflo ($tmp2,$r1,$d0) - mfhi ($h2,$r1,$d0) - daddu $h0,$tmp0 - daddu $h1,$tmp1 - sltu $tmp0,$h0,$tmp0 - - dmultu ($r0,$d1) # h1*r0 - daddu $h1,$tmp0 - daddu $h1,$tmp2 - mflo ($tmp0,$r0,$d1) - mfhi ($tmp1,$r0,$d1) - - dmultu ($rs1,$d2) # h2*5*r1 - sltu $tmp2,$h1,$tmp2 - daddu $h2,$tmp2 - mflo ($tmp2,$rs1,$d2) - - dmultu ($r0,$d2) # h2*r0 - daddu $h1,$tmp0 - daddu $h2,$tmp1 - mflo ($tmp3,$r0,$d2) - sltu $tmp0,$h1,$tmp0 - daddu $h2,$tmp0 - - daddu $h1,$tmp2 - sltu $tmp2,$h1,$tmp2 - daddu $h2,$tmp2 - daddu $h2,$tmp3 - - bne $inp,$len,.Loop - - sd $h0,0($ctx) # store hash value - sd $h1,8($ctx) - sd $h2,16($ctx) - - .set noreorder -#if defined(_MIPS_ARCH_MIPS64R6) - ld $s7,56($sp) - ld $s6,48($sp) -#endif - ld $s5,40($sp) # epilogue - ld $s4,32($sp) -___ -$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue - ld $s3,24($sp) - ld $s2,16($sp) - ld $s1,8($sp) - ld $s0,0($sp) -___ -$code.=<<___; - jr $ra -#if defined(_MIPS_ARCH_MIPS64R6) - daddu $sp,8*8 -#else - daddu $sp,6*8 -#endif -.end poly1305_blocks_internal -___ -} -{ -my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); - -$code.=<<___; -.align 5 -.globl poly1305_emit -.ent poly1305_emit -poly1305_emit: - .frame $sp,0,$ra - .set reorder - - ld $tmp2,16($ctx) - ld $tmp0,0($ctx) - ld $tmp1,8($ctx) - - li $in0,-4 # final reduction - dsrl $in1,$tmp2,2 - and $in0,$tmp2 - andi $tmp2,$tmp2,3 - daddu $in0,$in1 - - daddu $tmp0,$tmp0,$in0 - sltu $in1,$tmp0,$in0 - daddiu $in0,$tmp0,5 # compare to modulus - daddu $tmp1,$tmp1,$in1 - sltiu $tmp3,$in0,5 - sltu $tmp4,$tmp1,$in1 - daddu $in1,$tmp1,$tmp3 - daddu $tmp2,$tmp2,$tmp4 - sltu $tmp3,$in1,$tmp3 - daddu $tmp2,$tmp2,$tmp3 - - dsrl $tmp2,2 # see if it carried/borrowed - dsubu $tmp2,$zero,$tmp2 - - xor $in0,$tmp0 - xor $in1,$tmp1 - and $in0,$tmp2 - and $in1,$tmp2 - xor $in0,$tmp0 - xor $in1,$tmp1 - - lwu $tmp0,0($nonce) # load nonce - lwu $tmp1,4($nonce) - lwu $tmp2,8($nonce) - lwu $tmp3,12($nonce) - dsll $tmp1,32 - dsll $tmp3,32 - or $tmp0,$tmp1 - or $tmp2,$tmp3 - - daddu $in0,$tmp0 # accumulate nonce - daddu $in1,$tmp2 - sltu $tmp0,$in0,$tmp0 - daddu $in1,$tmp0 - - dsrl $tmp0,$in0,8 # write mac value - dsrl $tmp1,$in0,16 - dsrl $tmp2,$in0,24 - sb $in0,0($mac) - dsrl $tmp3,$in0,32 - sb $tmp0,1($mac) - dsrl $tmp0,$in0,40 - sb $tmp1,2($mac) - dsrl $tmp1,$in0,48 - sb $tmp2,3($mac) - dsrl $tmp2,$in0,56 - sb $tmp3,4($mac) - dsrl $tmp3,$in1,8 - sb $tmp0,5($mac) - dsrl $tmp0,$in1,16 - sb $tmp1,6($mac) - dsrl $tmp1,$in1,24 - sb $tmp2,7($mac) - - sb $in1,8($mac) - dsrl $tmp2,$in1,32 - sb $tmp3,9($mac) - dsrl $tmp3,$in1,40 - sb $tmp0,10($mac) - dsrl $tmp0,$in1,48 - sb $tmp1,11($mac) - dsrl $tmp1,$in1,56 - sb $tmp2,12($mac) - sb $tmp3,13($mac) - sb $tmp0,14($mac) - sb $tmp1,15($mac) - - jr $ra -.end poly1305_emit -.rdata -.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" -.align 2 -___ -} -}}} else {{{ -###################################################################### -# 32-bit code path -# - -my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); -my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = - ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); - -$code.=<<___; -#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ - 
defined(_MIPS_ARCH_MIPS32R6)) \\ - && !defined(_MIPS_ARCH_MIPS32R2) -# define _MIPS_ARCH_MIPS32R2 -#endif - -#if defined(_MIPS_ARCH_MIPS32R6) -# define multu(rs,rt) -# define mflo(rd,rs,rt) mulu rd,rs,rt -# define mfhi(rd,rs,rt) muhu rd,rs,rt -#else -# define multu(rs,rt) multu rs,rt -# define mflo(rd,rs,rt) mflo rd -# define mfhi(rd,rs,rt) mfhi rd -#endif - -#ifdef __KERNEL__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arch -# define poly1305_emit poly1305_emit_arch -#endif - -#if defined(__MIPSEB__) && !defined(MIPSEB) -# define MIPSEB -#endif - -#ifdef MIPSEB -# define MSB 0 -# define LSB 3 -#else -# define MSB 3 -# define LSB 0 -#endif - -.text -.set noat -.set noreorder - -.align 5 -.globl poly1305_init -.ent poly1305_init -poly1305_init: - .frame $sp,0,$ra - .set reorder - - sw $zero,0($ctx) - sw $zero,4($ctx) - sw $zero,8($ctx) - sw $zero,12($ctx) - sw $zero,16($ctx) - - beqz $inp,.Lno_key - -#if defined(_MIPS_ARCH_MIPS32R6) - andi $tmp0,$inp,3 # $inp % 4 - subu $inp,$inp,$tmp0 # align $inp - sll $tmp0,$tmp0,3 # byte to bit offset - lw $in0,0($inp) - lw $in1,4($inp) - lw $in2,8($inp) - lw $in3,12($inp) - beqz $tmp0,.Laligned_key - - lw $tmp2,16($inp) - subu $tmp1,$zero,$tmp0 -# ifdef MIPSEB - sllv $in0,$in0,$tmp0 - srlv $tmp3,$in1,$tmp1 - sllv $in1,$in1,$tmp0 - or $in0,$in0,$tmp3 - srlv $tmp3,$in2,$tmp1 - sllv $in2,$in2,$tmp0 - or $in1,$in1,$tmp3 - srlv $tmp3,$in3,$tmp1 - sllv $in3,$in3,$tmp0 - or $in2,$in2,$tmp3 - srlv $tmp2,$tmp2,$tmp1 - or $in3,$in3,$tmp2 -# else - srlv $in0,$in0,$tmp0 - sllv $tmp3,$in1,$tmp1 - srlv $in1,$in1,$tmp0 - or $in0,$in0,$tmp3 - sllv $tmp3,$in2,$tmp1 - srlv $in2,$in2,$tmp0 - or $in1,$in1,$tmp3 - sllv $tmp3,$in3,$tmp1 - srlv $in3,$in3,$tmp0 - or $in2,$in2,$tmp3 - sllv $tmp2,$tmp2,$tmp1 - or $in3,$in3,$tmp2 -# endif -.Laligned_key: -#else - lwl $in0,0+MSB($inp) - lwl $in1,4+MSB($inp) - lwl $in2,8+MSB($inp) - lwl $in3,12+MSB($inp) - lwr $in0,0+LSB($inp) - lwr $in1,4+LSB($inp) - lwr $in2,8+LSB($inp) - lwr $in3,12+LSB($inp) -#endif -#ifdef MIPSEB -# if defined(_MIPS_ARCH_MIPS32R2) - wsbh $in0,$in0 # byte swap - wsbh $in1,$in1 - wsbh $in2,$in2 - wsbh $in3,$in3 - rotr $in0,$in0,16 - rotr $in1,$in1,16 - rotr $in2,$in2,16 - rotr $in3,$in3,16 -# else - srl $tmp0,$in0,24 # byte swap - srl $tmp1,$in0,8 - andi $tmp2,$in0,0xFF00 - sll $in0,$in0,24 - andi $tmp1,0xFF00 - sll $tmp2,$tmp2,8 - or $in0,$tmp0 - srl $tmp0,$in1,24 - or $tmp1,$tmp2 - srl $tmp2,$in1,8 - or $in0,$tmp1 - andi $tmp1,$in1,0xFF00 - sll $in1,$in1,24 - andi $tmp2,0xFF00 - sll $tmp1,$tmp1,8 - or $in1,$tmp0 - srl $tmp0,$in2,24 - or $tmp2,$tmp1 - srl $tmp1,$in2,8 - or $in1,$tmp2 - andi $tmp2,$in2,0xFF00 - sll $in2,$in2,24 - andi $tmp1,0xFF00 - sll $tmp2,$tmp2,8 - or $in2,$tmp0 - srl $tmp0,$in3,24 - or $tmp1,$tmp2 - srl $tmp2,$in3,8 - or $in2,$tmp1 - andi $tmp1,$in3,0xFF00 - sll $in3,$in3,24 - andi $tmp2,0xFF00 - sll $tmp1,$tmp1,8 - or $in3,$tmp0 - or $tmp2,$tmp1 - or $in3,$tmp2 -# endif -#endif - lui $tmp0,0x0fff - ori $tmp0,0xffff # 0x0fffffff - and $in0,$in0,$tmp0 - subu $tmp0,3 # 0x0ffffffc - and $in1,$in1,$tmp0 - and $in2,$in2,$tmp0 - and $in3,$in3,$tmp0 - - sw $in0,20($ctx) - sw $in1,24($ctx) - sw $in2,28($ctx) - sw $in3,32($ctx) - - srl $tmp1,$in1,2 - srl $tmp2,$in2,2 - srl $tmp3,$in3,2 - addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) - addu $in2,$in2,$tmp2 - addu $in3,$in3,$tmp3 - sw $in1,36($ctx) - sw $in2,40($ctx) - sw $in3,44($ctx) -.Lno_key: - li $v0,0 - jr $ra -.end poly1305_init -___ -{ -my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0x00fff000" : "0x00ff0000"; - -my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = - ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); -my ($d0,$d1,$d2,$d3) = - ($a4,$a5,$a6,$a7); -my $shr = $t2; # used on R6 -my $one = $t2; # used on R2 - -$code.=<<___; -.globl poly1305_blocks -.align 5 -.ent poly1305_blocks -poly1305_blocks: - .frame $sp,16*4,$ra - .mask $SAVED_REGS_MASK,-4 - .set noreorder - subu $sp, $sp,4*12 - sw $s11,4*11($sp) - sw $s10,4*10($sp) - sw $s9, 4*9($sp) - sw $s8, 4*8($sp) - sw $s7, 4*7($sp) - sw $s6, 4*6($sp) - sw $s5, 4*5($sp) - sw $s4, 4*4($sp) -___ -$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue - sw $s3, 4*3($sp) - sw $s2, 4*2($sp) - sw $s1, 4*1($sp) - sw $s0, 4*0($sp) -___ -$code.=<<___; - .set reorder - - srl $len,4 # number of complete blocks - li $one,1 - beqz $len,.Labort - -#if defined(_MIPS_ARCH_MIPS32R6) - andi $shr,$inp,3 - subu $inp,$inp,$shr # align $inp - sll $shr,$shr,3 # byte to bit offset -#endif - - lw $h0,0($ctx) # load hash value - lw $h1,4($ctx) - lw $h2,8($ctx) - lw $h3,12($ctx) - lw $h4,16($ctx) - - lw $r0,20($ctx) # load key - lw $r1,24($ctx) - lw $r2,28($ctx) - lw $r3,32($ctx) - lw $rs1,36($ctx) - lw $rs2,40($ctx) - lw $rs3,44($ctx) - - sll $len,4 - addu $len,$len,$inp # end of buffer - b .Loop - -.align 4 -.Loop: -#if defined(_MIPS_ARCH_MIPS32R6) - lw $d0,0($inp) # load input - lw $d1,4($inp) - lw $d2,8($inp) - lw $d3,12($inp) - beqz $shr,.Laligned_inp - - lw $t0,16($inp) - subu $t1,$zero,$shr -# ifdef MIPSEB - sllv $d0,$d0,$shr - srlv $at,$d1,$t1 - sllv $d1,$d1,$shr - or $d0,$d0,$at - srlv $at,$d2,$t1 - sllv $d2,$d2,$shr - or $d1,$d1,$at - srlv $at,$d3,$t1 - sllv $d3,$d3,$shr - or $d2,$d2,$at - srlv $t0,$t0,$t1 - or $d3,$d3,$t0 -# else - srlv $d0,$d0,$shr - sllv $at,$d1,$t1 - srlv $d1,$d1,$shr - or $d0,$d0,$at - sllv $at,$d2,$t1 - srlv $d2,$d2,$shr - or $d1,$d1,$at - sllv $at,$d3,$t1 - srlv $d3,$d3,$shr - or $d2,$d2,$at - sllv $t0,$t0,$t1 - or $d3,$d3,$t0 -# endif -.Laligned_inp: -#else - lwl $d0,0+MSB($inp) # load input - lwl $d1,4+MSB($inp) - lwl $d2,8+MSB($inp) - lwl $d3,12+MSB($inp) - lwr $d0,0+LSB($inp) - lwr $d1,4+LSB($inp) - lwr $d2,8+LSB($inp) - lwr $d3,12+LSB($inp) -#endif -#ifdef MIPSEB -# if defined(_MIPS_ARCH_MIPS32R2) - wsbh $d0,$d0 # byte swap - wsbh $d1,$d1 - wsbh $d2,$d2 - wsbh $d3,$d3 - rotr $d0,$d0,16 - rotr $d1,$d1,16 - rotr $d2,$d2,16 - rotr $d3,$d3,16 -# else - srl $at,$d0,24 # byte swap - srl $t0,$d0,8 - andi $t1,$d0,0xFF00 - sll $d0,$d0,24 - andi $t0,0xFF00 - sll $t1,$t1,8 - or $d0,$at - srl $at,$d1,24 - or $t0,$t1 - srl $t1,$d1,8 - or $d0,$t0 - andi $t0,$d1,0xFF00 - sll $d1,$d1,24 - andi $t1,0xFF00 - sll $t0,$t0,8 - or $d1,$at - srl $at,$d2,24 - or $t1,$t0 - srl $t0,$d2,8 - or $d1,$t1 - andi $t1,$d2,0xFF00 - sll $d2,$d2,24 - andi $t0,0xFF00 - sll $t1,$t1,8 - or $d2,$at - srl $at,$d3,24 - or $t0,$t1 - srl $t1,$d3,8 - or $d2,$t0 - andi $t0,$d3,0xFF00 - sll $d3,$d3,24 - andi $t1,0xFF00 - sll $t0,$t0,8 - or $d3,$at - or $t1,$t0 - or $d3,$t1 -# endif -#endif - srl $t0,$h4,2 # modulo-scheduled reduction - andi $h4,$h4,3 - sll $at,$t0,2 - - addu $d0,$d0,$h0 # accumulate input - addu $t0,$t0,$at - sltu $h0,$d0,$h0 - addu $d0,$d0,$t0 # ... 
and residue - sltu $at,$d0,$t0 - - addu $d1,$d1,$h1 - addu $h0,$h0,$at # carry - sltu $h1,$d1,$h1 - addu $d1,$d1,$h0 - sltu $h0,$d1,$h0 - - addu $d2,$d2,$h2 - addu $h1,$h1,$h0 # carry - sltu $h2,$d2,$h2 - addu $d2,$d2,$h1 - sltu $h1,$d2,$h1 - - addu $d3,$d3,$h3 - addu $h2,$h2,$h1 # carry - sltu $h3,$d3,$h3 - addu $d3,$d3,$h2 - -#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) - multu $r0,$d0 # d0*r0 - sltu $h2,$d3,$h2 - maddu $rs3,$d1 # d1*s3 - addu $h3,$h3,$h2 # carry - maddu $rs2,$d2 # d2*s2 - addu $h4,$h4,$padbit - maddu $rs1,$d3 # d3*s1 - addu $h4,$h4,$h3 - mfhi $at - mflo $h0 - - multu $r1,$d0 # d0*r1 - maddu $r0,$d1 # d1*r0 - maddu $rs3,$d2 # d2*s3 - maddu $rs2,$d3 # d3*s2 - maddu $rs1,$h4 # h4*s1 - maddu $at,$one # hi*1 - mfhi $at - mflo $h1 - - multu $r2,$d0 # d0*r2 - maddu $r1,$d1 # d1*r1 - maddu $r0,$d2 # d2*r0 - maddu $rs3,$d3 # d3*s3 - maddu $rs2,$h4 # h4*s2 - maddu $at,$one # hi*1 - mfhi $at - mflo $h2 - - mul $t0,$r0,$h4 # h4*r0 - - multu $r3,$d0 # d0*r3 - maddu $r2,$d1 # d1*r2 - maddu $r1,$d2 # d2*r1 - maddu $r0,$d3 # d3*r0 - maddu $rs3,$h4 # h4*s3 - maddu $at,$one # hi*1 - mfhi $at - mflo $h3 - - addiu $inp,$inp,16 - - addu $h4,$t0,$at -#else - multu ($r0,$d0) # d0*r0 - mflo ($h0,$r0,$d0) - mfhi ($h1,$r0,$d0) - - sltu $h2,$d3,$h2 - addu $h3,$h3,$h2 # carry - - multu ($rs3,$d1) # d1*s3 - mflo ($at,$rs3,$d1) - mfhi ($t0,$rs3,$d1) - - addu $h4,$h4,$padbit - addiu $inp,$inp,16 - addu $h4,$h4,$h3 - - multu ($rs2,$d2) # d2*s2 - mflo ($a3,$rs2,$d2) - mfhi ($t1,$rs2,$d2) - addu $h0,$h0,$at - addu $h1,$h1,$t0 - multu ($rs1,$d3) # d3*s1 - sltu $at,$h0,$at - addu $h1,$h1,$at - - mflo ($at,$rs1,$d3) - mfhi ($t0,$rs1,$d3) - addu $h0,$h0,$a3 - addu $h1,$h1,$t1 - multu ($r1,$d0) # d0*r1 - sltu $a3,$h0,$a3 - addu $h1,$h1,$a3 - - - mflo ($a3,$r1,$d0) - mfhi ($h2,$r1,$d0) - addu $h0,$h0,$at - addu $h1,$h1,$t0 - multu ($r0,$d1) # d1*r0 - sltu $at,$h0,$at - addu $h1,$h1,$at - - mflo ($at,$r0,$d1) - mfhi ($t0,$r0,$d1) - addu $h1,$h1,$a3 - sltu $a3,$h1,$a3 - multu ($rs3,$d2) # d2*s3 - addu $h2,$h2,$a3 - - mflo ($a3,$rs3,$d2) - mfhi ($t1,$rs3,$d2) - addu $h1,$h1,$at - addu $h2,$h2,$t0 - multu ($rs2,$d3) # d3*s2 - sltu $at,$h1,$at - addu $h2,$h2,$at - - mflo ($at,$rs2,$d3) - mfhi ($t0,$rs2,$d3) - addu $h1,$h1,$a3 - addu $h2,$h2,$t1 - multu ($rs1,$h4) # h4*s1 - sltu $a3,$h1,$a3 - addu $h2,$h2,$a3 - - mflo ($a3,$rs1,$h4) - addu $h1,$h1,$at - addu $h2,$h2,$t0 - multu ($r2,$d0) # d0*r2 - sltu $at,$h1,$at - addu $h2,$h2,$at - - - mflo ($at,$r2,$d0) - mfhi ($h3,$r2,$d0) - addu $h1,$h1,$a3 - sltu $a3,$h1,$a3 - multu ($r1,$d1) # d1*r1 - addu $h2,$h2,$a3 - - mflo ($a3,$r1,$d1) - mfhi ($t1,$r1,$d1) - addu $h2,$h2,$at - sltu $at,$h2,$at - multu ($r0,$d2) # d2*r0 - addu $h3,$h3,$at - - mflo ($at,$r0,$d2) - mfhi ($t0,$r0,$d2) - addu $h2,$h2,$a3 - addu $h3,$h3,$t1 - multu ($rs3,$d3) # d3*s3 - sltu $a3,$h2,$a3 - addu $h3,$h3,$a3 - - mflo ($a3,$rs3,$d3) - mfhi ($t1,$rs3,$d3) - addu $h2,$h2,$at - addu $h3,$h3,$t0 - multu ($rs2,$h4) # h4*s2 - sltu $at,$h2,$at - addu $h3,$h3,$at - - mflo ($at,$rs2,$h4) - addu $h2,$h2,$a3 - addu $h3,$h3,$t1 - multu ($r3,$d0) # d0*r3 - sltu $a3,$h2,$a3 - addu $h3,$h3,$a3 - - - mflo ($a3,$r3,$d0) - mfhi ($t1,$r3,$d0) - addu $h2,$h2,$at - sltu $at,$h2,$at - multu ($r2,$d1) # d1*r2 - addu $h3,$h3,$at - - mflo ($at,$r2,$d1) - mfhi ($t0,$r2,$d1) - addu $h3,$h3,$a3 - sltu $a3,$h3,$a3 - multu ($r0,$d3) # d3*r0 - addu $t1,$t1,$a3 - - mflo ($a3,$r0,$d3) - mfhi ($d3,$r0,$d3) - addu $h3,$h3,$at - addu $t1,$t1,$t0 - multu ($r1,$d2) # d2*r1 - sltu $at,$h3,$at - addu $t1,$t1,$at - 
- mflo ($at,$r1,$d2) - mfhi ($t0,$r1,$d2) - addu $h3,$h3,$a3 - addu $t1,$t1,$d3 - multu ($rs3,$h4) # h4*s3 - sltu $a3,$h3,$a3 - addu $t1,$t1,$a3 - - mflo ($a3,$rs3,$h4) - addu $h3,$h3,$at - addu $t1,$t1,$t0 - multu ($r0,$h4) # h4*r0 - sltu $at,$h3,$at - addu $t1,$t1,$at - - - mflo ($h4,$r0,$h4) - addu $h3,$h3,$a3 - sltu $a3,$h3,$a3 - addu $t1,$t1,$a3 - addu $h4,$h4,$t1 - - li $padbit,1 # if we loop, padbit is 1 -#endif - bne $inp,$len,.Loop - - sw $h0,0($ctx) # store hash value - sw $h1,4($ctx) - sw $h2,8($ctx) - sw $h3,12($ctx) - sw $h4,16($ctx) - - .set noreorder -.Labort: - lw $s11,4*11($sp) - lw $s10,4*10($sp) - lw $s9, 4*9($sp) - lw $s8, 4*8($sp) - lw $s7, 4*7($sp) - lw $s6, 4*6($sp) - lw $s5, 4*5($sp) - lw $s4, 4*4($sp) -___ -$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue - lw $s3, 4*3($sp) - lw $s2, 4*2($sp) - lw $s1, 4*1($sp) - lw $s0, 4*0($sp) -___ -$code.=<<___; - jr $ra - addu $sp,$sp,4*12 -.end poly1305_blocks -___ -} -{ -my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); - -$code.=<<___; -.align 5 -.globl poly1305_emit -.ent poly1305_emit -poly1305_emit: - .frame $sp,0,$ra - .set reorder - - lw $tmp4,16($ctx) - lw $tmp0,0($ctx) - lw $tmp1,4($ctx) - lw $tmp2,8($ctx) - lw $tmp3,12($ctx) - - li $in0,-4 # final reduction - srl $ctx,$tmp4,2 - and $in0,$in0,$tmp4 - andi $tmp4,$tmp4,3 - addu $ctx,$ctx,$in0 - - addu $tmp0,$tmp0,$ctx - sltu $ctx,$tmp0,$ctx - addiu $in0,$tmp0,5 # compare to modulus - addu $tmp1,$tmp1,$ctx - sltiu $in1,$in0,5 - sltu $ctx,$tmp1,$ctx - addu $in1,$in1,$tmp1 - addu $tmp2,$tmp2,$ctx - sltu $in2,$in1,$tmp1 - sltu $ctx,$tmp2,$ctx - addu $in2,$in2,$tmp2 - addu $tmp3,$tmp3,$ctx - sltu $in3,$in2,$tmp2 - sltu $ctx,$tmp3,$ctx - addu $in3,$in3,$tmp3 - addu $tmp4,$tmp4,$ctx - sltu $ctx,$in3,$tmp3 - addu $ctx,$tmp4 - - srl $ctx,2 # see if it carried/borrowed - subu $ctx,$zero,$ctx - - xor $in0,$tmp0 - xor $in1,$tmp1 - xor $in2,$tmp2 - xor $in3,$tmp3 - and $in0,$ctx - and $in1,$ctx - and $in2,$ctx - and $in3,$ctx - xor $in0,$tmp0 - xor $in1,$tmp1 - xor $in2,$tmp2 - xor $in3,$tmp3 - - lw $tmp0,0($nonce) # load nonce - lw $tmp1,4($nonce) - lw $tmp2,8($nonce) - lw $tmp3,12($nonce) - - addu $in0,$tmp0 # accumulate nonce - sltu $ctx,$in0,$tmp0 - - addu $in1,$tmp1 - sltu $tmp1,$in1,$tmp1 - addu $in1,$ctx - sltu $ctx,$in1,$ctx - addu $ctx,$tmp1 - - addu $in2,$tmp2 - sltu $tmp2,$in2,$tmp2 - addu $in2,$ctx - sltu $ctx,$in2,$ctx - addu $ctx,$tmp2 - - addu $in3,$tmp3 - addu $in3,$ctx - - srl $tmp0,$in0,8 # write mac value - srl $tmp1,$in0,16 - srl $tmp2,$in0,24 - sb $in0, 0($mac) - sb $tmp0,1($mac) - srl $tmp0,$in1,8 - sb $tmp1,2($mac) - srl $tmp1,$in1,16 - sb $tmp2,3($mac) - srl $tmp2,$in1,24 - sb $in1, 4($mac) - sb $tmp0,5($mac) - srl $tmp0,$in2,8 - sb $tmp1,6($mac) - srl $tmp1,$in2,16 - sb $tmp2,7($mac) - srl $tmp2,$in2,24 - sb $in2, 8($mac) - sb $tmp0,9($mac) - srl $tmp0,$in3,8 - sb $tmp1,10($mac) - srl $tmp1,$in3,16 - sb $tmp2,11($mac) - srl $tmp2,$in3,24 - sb $in3, 12($mac) - sb $tmp0,13($mac) - sb $tmp1,14($mac) - sb $tmp2,15($mac) - - jr $ra -.end poly1305_emit -.rdata -.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" -.align 2 -___ -} -}}} - -$output=pop and open STDOUT,">$output"; -print $code; -close STDOUT; diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index fdeb91bf0032..43c44316fbbd 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -196,7 +196,7 @@ if ARM64 source "lib/crypto/arm64/Kconfig" endif if MIPS -source "arch/mips/lib/crypto/Kconfig" +source "lib/crypto/mips/Kconfig" endif if PPC source 
"arch/powerpc/lib/crypto/Kconfig" diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 19e9880c5d5f..f54d2f3edc40 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -109,3 +109,4 @@ libsm3-y := sm3.o obj-$(CONFIG_ARM) += arm/ obj-$(CONFIG_ARM64) += arm64/ +obj-$(CONFIG_MIPS) += mips/ diff --git a/lib/crypto/mips/.gitignore b/lib/crypto/mips/.gitignore new file mode 100644 index 000000000000..0d47d4f21c6d --- /dev/null +++ b/lib/crypto/mips/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-core.S diff --git a/lib/crypto/mips/Kconfig b/lib/crypto/mips/Kconfig new file mode 100644 index 000000000000..0670a170c1be --- /dev/null +++ b/lib/crypto/mips/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA_MIPS + tristate + depends on CPU_MIPS32_R2 + default CRYPTO_LIB_CHACHA + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_MIPS + tristate + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/mips/Makefile b/lib/crypto/mips/Makefile new file mode 100644 index 000000000000..804488c7aded --- /dev/null +++ b/lib/crypto/mips/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o +chacha-mips-y := chacha-core.o chacha-glue.o +AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots + +obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o +poly1305-mips-y := poly1305-core.o poly1305-glue.o + +perlasm-flavour-$(CONFIG_32BIT) := o32 +perlasm-flavour-$(CONFIG_64BIT) := 64 + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) + +$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE + $(call if_changed,perlasm) + +targets += poly1305-core.S diff --git a/lib/crypto/mips/chacha-core.S b/lib/crypto/mips/chacha-core.S new file mode 100644 index 000000000000..5755f69cfe00 --- /dev/null +++ b/lib/crypto/mips/chacha-core.S @@ -0,0 +1,497 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2016-2018 René van Dorst . All Rights Reserved. + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + */ + +#define MASK_U32 0x3c +#define CHACHA20_BLOCK_SIZE 64 +#define STACK_SIZE 32 + +#define X0 $t0 +#define X1 $t1 +#define X2 $t2 +#define X3 $t3 +#define X4 $t4 +#define X5 $t5 +#define X6 $t6 +#define X7 $t7 +#define X8 $t8 +#define X9 $t9 +#define X10 $v1 +#define X11 $s6 +#define X12 $s5 +#define X13 $s4 +#define X14 $s3 +#define X15 $s2 +/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ +#define T0 $s1 +#define T1 $s0 +#define T(n) T ## n +#define X(n) X ## n + +/* Input arguments */ +#define STATE $a0 +#define OUT $a1 +#define IN $a2 +#define BYTES $a3 + +/* Output argument */ +/* NONCE[0] is kept in a register and not in memory. + * We don't want to touch original value in memory. + * Must be incremented every loop iteration. + */ +#define NONCE_0 $v0 + +/* SAVED_X and SAVED_CA are set in the jump table. + * Use regs which are overwritten on exit else we don't leak clear data. + * They are used to handling the last bytes which are not multiple of 4. 
+ */ +#define SAVED_X X15 +#define SAVED_CA $s7 + +#define IS_UNALIGNED $s7 + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define MSB 0 +#define LSB 3 +#define ROTx rotl +#define ROTR(n) rotr n, 24 +#define CPU_TO_LE32(n) \ + wsbh n; \ + rotr n, 16; +#else +#define MSB 3 +#define LSB 0 +#define ROTx rotr +#define CPU_TO_LE32(n) +#define ROTR(n) +#endif + +#define FOR_EACH_WORD(x) \ + x( 0); \ + x( 1); \ + x( 2); \ + x( 3); \ + x( 4); \ + x( 5); \ + x( 6); \ + x( 7); \ + x( 8); \ + x( 9); \ + x(10); \ + x(11); \ + x(12); \ + x(13); \ + x(14); \ + x(15); + +#define FOR_EACH_WORD_REV(x) \ + x(15); \ + x(14); \ + x(13); \ + x(12); \ + x(11); \ + x(10); \ + x( 9); \ + x( 8); \ + x( 7); \ + x( 6); \ + x( 5); \ + x( 4); \ + x( 3); \ + x( 2); \ + x( 1); \ + x( 0); + +#define PLUS_ONE_0 1 +#define PLUS_ONE_1 2 +#define PLUS_ONE_2 3 +#define PLUS_ONE_3 4 +#define PLUS_ONE_4 5 +#define PLUS_ONE_5 6 +#define PLUS_ONE_6 7 +#define PLUS_ONE_7 8 +#define PLUS_ONE_8 9 +#define PLUS_ONE_9 10 +#define PLUS_ONE_10 11 +#define PLUS_ONE_11 12 +#define PLUS_ONE_12 13 +#define PLUS_ONE_13 14 +#define PLUS_ONE_14 15 +#define PLUS_ONE_15 16 +#define PLUS_ONE(x) PLUS_ONE_ ## x +#define _CONCAT3(a,b,c) a ## b ## c +#define CONCAT3(a,b,c) _CONCAT3(a,b,c) + +#define STORE_UNALIGNED(x) \ +CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ + .endif; \ + lwl T1, (x*4)+MSB ## (IN); \ + lwr T1, (x*4)+LSB ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ + .else; \ + addu X ## x, T0; \ + .endif; \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + swl X ## x, (x*4)+MSB ## (OUT); \ + swr X ## x, (x*4)+LSB ## (OUT); + +#define STORE_ALIGNED(x) \ +CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ + .endif; \ + lw T1, (x*4) ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ + .else; \ + addu X ## x, T0; \ + .endif; \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + sw X ## x, (x*4) ## (OUT); + +/* Jump table macro. + * Used for setup and handling the last bytes, which are not multiple of 4. + * X15 is free to store Xn + * Every jumptable entry must be equal in size. + */ +#define JMPTBL_ALIGNED(x) \ +.Lchacha_mips_jmptbl_aligned_ ## x: ; \ + .set noreorder; \ + b .Lchacha_mips_xor_aligned_ ## x ## _b; \ + .if (x == 12); \ + addu SAVED_X, X ## x, NONCE_0; \ + .else; \ + addu SAVED_X, X ## x, SAVED_CA; \ + .endif; \ + .set reorder + +#define JMPTBL_UNALIGNED(x) \ +.Lchacha_mips_jmptbl_unaligned_ ## x: ; \ + .set noreorder; \ + b .Lchacha_mips_xor_unaligned_ ## x ## _b; \ + .if (x == 12); \ + addu SAVED_X, X ## x, NONCE_0; \ + .else; \ + addu SAVED_X, X ## x, SAVED_CA; \ + .endif; \ + .set reorder + +#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ + addu X(A), X(K); \ + addu X(B), X(L); \ + addu X(C), X(M); \ + addu X(D), X(N); \ + xor X(V), X(A); \ + xor X(W), X(B); \ + xor X(Y), X(C); \ + xor X(Z), X(D); \ + rotl X(V), S; \ + rotl X(W), S; \ + rotl X(Y), S; \ + rotl X(Z), S; + +.text +.set reorder +.set noat +.globl chacha_crypt_arch +.ent chacha_crypt_arch +chacha_crypt_arch: + .frame $sp, STACK_SIZE, $ra + + /* Load number of rounds */ + lw $at, 16($sp) + + addiu $sp, -STACK_SIZE + + /* Return bytes = 0. */ + beqz BYTES, .Lchacha_mips_end + + lw NONCE_0, 48(STATE) + + /* Save s0-s7 */ + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + sw $s4, 16($sp) + sw $s5, 20($sp) + sw $s6, 24($sp) + sw $s7, 28($sp) + + /* Test IN or OUT is unaligned. 
+ * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 + */ + or IS_UNALIGNED, IN, OUT + andi IS_UNALIGNED, 0x3 + + b .Lchacha_rounds_start + +.align 4 +.Loop_chacha_rounds: + addiu IN, CHACHA20_BLOCK_SIZE + addiu OUT, CHACHA20_BLOCK_SIZE + addiu NONCE_0, 1 + +.Lchacha_rounds_start: + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) + + move X12, NONCE_0 + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) + +.Loop_chacha_xor_rounds: + addiu $at, -2 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + bnez $at, .Loop_chacha_xor_rounds + + addiu BYTES, -(CHACHA20_BLOCK_SIZE) + + /* Is data src/dst unaligned? Jump */ + bnez IS_UNALIGNED, .Loop_chacha_unaligned + + /* Set number rounds here to fill delayslot. */ + lw $at, (STACK_SIZE+16)($sp) + + /* BYTES < 0, it has no full block. */ + bltz BYTES, .Lchacha_mips_no_full_block_aligned + + FOR_EACH_WORD_REV(STORE_ALIGNED) + + /* BYTES > 0? Loop again. */ + bgtz BYTES, .Loop_chacha_rounds + + /* Place this here to fill delay slot */ + addiu NONCE_0, 1 + + /* BYTES < 0? Handle last bytes */ + bltz BYTES, .Lchacha_mips_xor_bytes + +.Lchacha_mips_xor_done: + /* Restore used registers */ + lw $s0, 0($sp) + lw $s1, 4($sp) + lw $s2, 8($sp) + lw $s3, 12($sp) + lw $s4, 16($sp) + lw $s5, 20($sp) + lw $s6, 24($sp) + lw $s7, 28($sp) + + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + +.Lchacha_mips_end: + addiu $sp, STACK_SIZE + jr $ra + +.Lchacha_mips_no_full_block_aligned: + /* Restore the offset on BYTES */ + addiu BYTES, CHACHA20_BLOCK_SIZE + + /* Get number of full WORDS */ + andi $at, BYTES, MASK_U32 + + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0) + + /* Calculate lower half jump table offset */ + ins T0, $at, 1, 6 + + /* Add offset to STATE */ + addu T1, STATE, $at + + /* Add lower half jump table addr */ + addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0) + + /* Read value from STATE */ + lw SAVED_CA, 0(T1) + + /* Store remaining bytecounter as negative value */ + subu BYTES, $at, BYTES + + jr T0 + + /* Jump table */ + FOR_EACH_WORD(JMPTBL_ALIGNED) + + +.Loop_chacha_unaligned: + /* Set number rounds here to fill delayslot. */ + lw $at, (STACK_SIZE+16)($sp) + + /* BYTES > 0, it has no full block. */ + bltz BYTES, .Lchacha_mips_no_full_block_unaligned + + FOR_EACH_WORD_REV(STORE_UNALIGNED) + + /* BYTES > 0? Loop again. 
*/ + bgtz BYTES, .Loop_chacha_rounds + + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + + .set noreorder + /* Fall through to byte handling */ + bgez BYTES, .Lchacha_mips_xor_done +.Lchacha_mips_xor_unaligned_0_b: +.Lchacha_mips_xor_aligned_0_b: + /* Place this here to fill delay slot */ + addiu NONCE_0, 1 + .set reorder + +.Lchacha_mips_xor_bytes: + addu IN, $at + addu OUT, $at + /* First byte */ + lbu T1, 0(IN) + addiu $at, BYTES, 1 + CPU_TO_LE32(SAVED_X) + ROTR(SAVED_X) + xor T1, SAVED_X + sb T1, 0(OUT) + beqz $at, .Lchacha_mips_xor_done + /* Second byte */ + lbu T1, 1(IN) + addiu $at, BYTES, 2 + ROTx SAVED_X, 8 + xor T1, SAVED_X + sb T1, 1(OUT) + beqz $at, .Lchacha_mips_xor_done + /* Third byte */ + lbu T1, 2(IN) + ROTx SAVED_X, 8 + xor T1, SAVED_X + sb T1, 2(OUT) + b .Lchacha_mips_xor_done + +.Lchacha_mips_no_full_block_unaligned: + /* Restore the offset on BYTES */ + addiu BYTES, CHACHA20_BLOCK_SIZE + + /* Get number of full WORDS */ + andi $at, BYTES, MASK_U32 + + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0) + + /* Calculate lower half jump table offset */ + ins T0, $at, 1, 6 + + /* Add offset to STATE */ + addu T1, STATE, $at + + /* Add lower half jump table addr */ + addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0) + + /* Read value from STATE */ + lw SAVED_CA, 0(T1) + + /* Store remaining bytecounter as negative value */ + subu BYTES, $at, BYTES + + jr T0 + + /* Jump table */ + FOR_EACH_WORD(JMPTBL_UNALIGNED) +.end chacha_crypt_arch +.set at + +/* Input arguments + * STATE $a0 + * OUT $a1 + * NROUND $a2 + */ + +#undef X12 +#undef X13 +#undef X14 +#undef X15 + +#define X12 $a3 +#define X13 $at +#define X14 $v0 +#define X15 STATE + +.set noat +.globl hchacha_block_arch +.ent hchacha_block_arch +hchacha_block_arch: + .frame $sp, STACK_SIZE, $ra + + addiu $sp, -STACK_SIZE + + /* Save X11(s6) */ + sw X11, 0($sp) + + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) + lw X12, 48(STATE) + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) + +.Loop_hchacha_xor_rounds: + addiu $a2, -2 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + bnez $a2, .Loop_hchacha_xor_rounds + + /* Restore used register */ + lw X11, 0($sp) + + sw X0, 0(OUT) + sw X1, 4(OUT) + sw X2, 8(OUT) + sw X3, 12(OUT) + sw X12, 16(OUT) + sw X13, 20(OUT) + sw X14, 24(OUT) + sw X15, 28(OUT) + + addiu $sp, STACK_SIZE + jr $ra +.end hchacha_block_arch +.set at diff --git a/lib/crypto/mips/chacha-glue.c b/lib/crypto/mips/chacha-glue.c new file mode 100644 index 000000000000..88c097594eb0 --- /dev/null +++ b/lib/crypto/mips/chacha-glue.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha and HChaCha functions (MIPS optimized) + * + * Copyright (C) 2019 Linaro, Ltd. 
+ */ + +#include +#include +#include + +asmlinkage void chacha_crypt_arch(struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); +EXPORT_SYMBOL(chacha_crypt_arch); + +asmlinkage void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); +EXPORT_SYMBOL(hchacha_block_arch); + +bool chacha_is_arch_optimized(void) +{ + return true; +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-glue.c b/lib/crypto/mips/poly1305-glue.c new file mode 100644 index 000000000000..764a38a65200 --- /dev/null +++ b/lib/crypto/mips/poly1305-glue.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS + * + * Copyright (C) 2019 Linaro Ltd. + */ + +#include +#include +#include +#include +#include + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-mips.pl b/lib/crypto/mips/poly1305-mips.pl new file mode 100644 index 000000000000..399f10c3e385 --- /dev/null +++ b/lib/crypto/mips/poly1305-mips.pl @@ -0,0 +1,1273 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL +# project. +# ==================================================================== + +# Poly1305 hash for MIPS. +# +# May 2016 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc +# R1x000 ~5.5/+130% (big-endian) +# Octeon II 2.50/+70% (little-endian) +# +# March 2019 +# +# Add 32-bit code path. +# +# October 2019 +# +# Modulo-scheduling reduction allows to omit dependency chain at the +# end of inner loop and improve performance. Also optimize MIPS32R2 +# code path for MIPS 1004K core. Per René von Dorst's suggestions. +# +# IALU/gcc +# R1x000 ~9.8/? (big-endian) +# Octeon II 3.65/+140% (little-endian) +# MT7621/1004K 4.75/? (little-endian) +# +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. 
Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +# +# +###################################################################### + +$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 + +$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; + +if ($flavour =~ /64|n32/i) {{{ +###################################################################### +# 64-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ + defined(_MIPS_ARCH_MIPS64R6)) \\ + && !defined(_MIPS_ARCH_MIPS64R2) +# define _MIPS_ARCH_MIPS64R2 +#endif + +#if defined(_MIPS_ARCH_MIPS64R6) +# define dmultu(rs,rt) +# define mflo(rd,rs,rt) dmulu rd,rs,rt +# define mfhi(rd,rs,rt) dmuhu rd,rs,rt +#else +# define dmultu(rs,rt) dmultu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arch +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 7 +#else +# define MSB 7 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sd $zero,0($ctx) + sd $zero,8($ctx) + sd $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS64R6) + andi $tmp0,$inp,7 # $inp % 8 + dsubu $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset + ld $in0,0($inp) + ld $in1,8($inp) + beqz $tmp0,.Laligned_key + ld $tmp2,16($inp) + + subu $tmp1,$zero,$tmp0 +# ifdef MIPSEB + dsllv $in0,$in0,$tmp0 + dsrlv $tmp3,$in1,$tmp1 + dsllv $in1,$in1,$tmp0 + dsrlv $tmp2,$tmp2,$tmp1 +# else + dsrlv $in0,$in0,$tmp0 + dsllv $tmp3,$in1,$tmp1 + dsrlv $in1,$in1,$tmp0 + dsllv $tmp2,$tmp2,$tmp1 +# endif + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 +.Laligned_key: +#else + ldl $in0,0+MSB($inp) + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + 
or $in1,$tmp3 +# endif +#endif + li $tmp0,1 + dsll $tmp0,32 # 0x0000000100000000 + daddiu $tmp0,-63 # 0x00000000ffffffc1 + dsll $tmp0,28 # 0x0ffffffc10000000 + daddiu $tmp0,-1 # 0x0ffffffc0fffffff + + and $in0,$tmp0 + daddiu $tmp0,-3 # 0x0ffffffc0ffffffc + and $in1,$tmp0 + + sd $in0,24($ctx) + dsrl $tmp0,$in1,2 + sd $in1,32($ctx) + daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) + sd $tmp0,40($ctx) + +.Lno_key: + li $v0,0 # return 0 + jr $ra +.end poly1305_init +___ +{ +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; + +my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = + ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); +my ($shr,$shl) = ($s6,$s7); # used on R6 + +$code.=<<___; +.align 5 +.globl poly1305_blocks +.ent poly1305_blocks +poly1305_blocks: + .set noreorder + dsrl $len,4 # number of complete blocks + bnez $len,poly1305_blocks_internal + nop + jr $ra + nop +.end poly1305_blocks + +.align 5 +.ent poly1305_blocks_internal +poly1305_blocks_internal: + .set noreorder +#if defined(_MIPS_ARCH_MIPS64R6) + .frame $sp,8*8,$ra + .mask $SAVED_REGS_MASK|0x000c0000,-8 + dsubu $sp,8*8 + sd $s7,56($sp) + sd $s6,48($sp) +#else + .frame $sp,6*8,$ra + .mask $SAVED_REGS_MASK,-8 + dsubu $sp,6*8 +#endif + sd $s5,40($sp) + sd $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sd $s3,24($sp) + sd $s2,16($sp) + sd $s1,8($sp) + sd $s0,0($sp) +___ +$code.=<<___; + .set reorder + +#if defined(_MIPS_ARCH_MIPS64R6) + andi $shr,$inp,7 + dsubu $inp,$inp,$shr # align $inp + sll $shr,$shr,3 # byte to bit offset + subu $shl,$zero,$shr +#endif + + ld $h0,0($ctx) # load hash value + ld $h1,8($ctx) + ld $h2,16($ctx) + + ld $r0,24($ctx) # load key + ld $r1,32($ctx) + ld $rs1,40($ctx) + + dsll $len,4 + daddu $len,$inp # end of buffer + b .Loop + +.align 4 +.Loop: +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) # load input + ld $in1,8($inp) + beqz $shr,.Laligned_inp + + ld $tmp2,16($inp) +# ifdef MIPSEB + dsllv $in0,$in0,$shr + dsrlv $tmp3,$in1,$shl + dsllv $in1,$in1,$shr + dsrlv $tmp2,$tmp2,$shl +# else + dsrlv $in0,$in0,$shr + dsllv $tmp3,$in1,$shl + dsrlv $in1,$in1,$shr + dsllv $tmp2,$tmp2,$shl +# endif + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 +.Laligned_inp: +#else + ldl $in0,0+MSB($inp) # load input + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif + daddiu $inp,16 +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + dsrl $tmp1,$h2,2 # modulo-scheduled reduction + andi $h2,$h2,3 + dsll $tmp0,$tmp1,2 + + daddu $d0,$h0,$in0 # accumulate input + daddu $tmp1,$tmp0 + sltu $tmp0,$d0,$h0 + daddu $d0,$d0,$tmp1 # ... 
and residue + sltu $tmp1,$d0,$tmp1 + daddu $d1,$h1,$in1 + daddu $tmp0,$tmp1 + sltu $tmp1,$d1,$h1 + daddu $d1,$tmp0 + + dmultu ($r0,$d0) # h0*r0 + daddu $d2,$h2,$padbit + sltu $tmp0,$d1,$tmp0 + mflo ($h0,$r0,$d0) + mfhi ($h1,$r0,$d0) + + dmultu ($rs1,$d1) # h1*5*r1 + daddu $d2,$tmp1 + daddu $d2,$tmp0 + mflo ($tmp0,$rs1,$d1) + mfhi ($tmp1,$rs1,$d1) + + dmultu ($r1,$d0) # h0*r1 + mflo ($tmp2,$r1,$d0) + mfhi ($h2,$r1,$d0) + daddu $h0,$tmp0 + daddu $h1,$tmp1 + sltu $tmp0,$h0,$tmp0 + + dmultu ($r0,$d1) # h1*r0 + daddu $h1,$tmp0 + daddu $h1,$tmp2 + mflo ($tmp0,$r0,$d1) + mfhi ($tmp1,$r0,$d1) + + dmultu ($rs1,$d2) # h2*5*r1 + sltu $tmp2,$h1,$tmp2 + daddu $h2,$tmp2 + mflo ($tmp2,$rs1,$d2) + + dmultu ($r0,$d2) # h2*r0 + daddu $h1,$tmp0 + daddu $h2,$tmp1 + mflo ($tmp3,$r0,$d2) + sltu $tmp0,$h1,$tmp0 + daddu $h2,$tmp0 + + daddu $h1,$tmp2 + sltu $tmp2,$h1,$tmp2 + daddu $h2,$tmp2 + daddu $h2,$tmp3 + + bne $inp,$len,.Loop + + sd $h0,0($ctx) # store hash value + sd $h1,8($ctx) + sd $h2,16($ctx) + + .set noreorder +#if defined(_MIPS_ARCH_MIPS64R6) + ld $s7,56($sp) + ld $s6,48($sp) +#endif + ld $s5,40($sp) # epilogue + ld $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue + ld $s3,24($sp) + ld $s2,16($sp) + ld $s1,8($sp) + ld $s0,0($sp) +___ +$code.=<<___; + jr $ra +#if defined(_MIPS_ARCH_MIPS64R6) + daddu $sp,8*8 +#else + daddu $sp,6*8 +#endif +.end poly1305_blocks_internal +___ +} +{ +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + ld $tmp2,16($ctx) + ld $tmp0,0($ctx) + ld $tmp1,8($ctx) + + li $in0,-4 # final reduction + dsrl $in1,$tmp2,2 + and $in0,$tmp2 + andi $tmp2,$tmp2,3 + daddu $in0,$in1 + + daddu $tmp0,$tmp0,$in0 + sltu $in1,$tmp0,$in0 + daddiu $in0,$tmp0,5 # compare to modulus + daddu $tmp1,$tmp1,$in1 + sltiu $tmp3,$in0,5 + sltu $tmp4,$tmp1,$in1 + daddu $in1,$tmp1,$tmp3 + daddu $tmp2,$tmp2,$tmp4 + sltu $tmp3,$in1,$tmp3 + daddu $tmp2,$tmp2,$tmp3 + + dsrl $tmp2,2 # see if it carried/borrowed + dsubu $tmp2,$zero,$tmp2 + + xor $in0,$tmp0 + xor $in1,$tmp1 + and $in0,$tmp2 + and $in1,$tmp2 + xor $in0,$tmp0 + xor $in1,$tmp1 + + lwu $tmp0,0($nonce) # load nonce + lwu $tmp1,4($nonce) + lwu $tmp2,8($nonce) + lwu $tmp3,12($nonce) + dsll $tmp1,32 + dsll $tmp3,32 + or $tmp0,$tmp1 + or $tmp2,$tmp3 + + daddu $in0,$tmp0 # accumulate nonce + daddu $in1,$tmp2 + sltu $tmp0,$in0,$tmp0 + daddu $in1,$tmp0 + + dsrl $tmp0,$in0,8 # write mac value + dsrl $tmp1,$in0,16 + dsrl $tmp2,$in0,24 + sb $in0,0($mac) + dsrl $tmp3,$in0,32 + sb $tmp0,1($mac) + dsrl $tmp0,$in0,40 + sb $tmp1,2($mac) + dsrl $tmp1,$in0,48 + sb $tmp2,3($mac) + dsrl $tmp2,$in0,56 + sb $tmp3,4($mac) + dsrl $tmp3,$in1,8 + sb $tmp0,5($mac) + dsrl $tmp0,$in1,16 + sb $tmp1,6($mac) + dsrl $tmp1,$in1,24 + sb $tmp2,7($mac) + + sb $in1,8($mac) + dsrl $tmp2,$in1,32 + sb $tmp3,9($mac) + dsrl $tmp3,$in1,40 + sb $tmp0,10($mac) + dsrl $tmp0,$in1,48 + sb $tmp1,11($mac) + dsrl $tmp1,$in1,56 + sb $tmp2,12($mac) + sb $tmp3,13($mac) + sb $tmp0,14($mac) + sb $tmp1,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" +.align 2 +___ +} +}}} else {{{ +###################################################################### +# 32-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = + ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ + 
defined(_MIPS_ARCH_MIPS32R6)) \\ + && !defined(_MIPS_ARCH_MIPS32R2) +# define _MIPS_ARCH_MIPS32R2 +#endif + +#if defined(_MIPS_ARCH_MIPS32R6) +# define multu(rs,rt) +# define mflo(rd,rs,rt) mulu rd,rs,rt +# define mfhi(rd,rs,rt) muhu rd,rs,rt +#else +# define multu(rs,rt) multu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arch +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 3 +#else +# define MSB 3 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sw $zero,0($ctx) + sw $zero,4($ctx) + sw $zero,8($ctx) + sw $zero,12($ctx) + sw $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS32R6) + andi $tmp0,$inp,3 # $inp % 4 + subu $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset + lw $in0,0($inp) + lw $in1,4($inp) + lw $in2,8($inp) + lw $in3,12($inp) + beqz $tmp0,.Laligned_key + + lw $tmp2,16($inp) + subu $tmp1,$zero,$tmp0 +# ifdef MIPSEB + sllv $in0,$in0,$tmp0 + srlv $tmp3,$in1,$tmp1 + sllv $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + srlv $tmp3,$in2,$tmp1 + sllv $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + srlv $tmp3,$in3,$tmp1 + sllv $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + srlv $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +# else + srlv $in0,$in0,$tmp0 + sllv $tmp3,$in1,$tmp1 + srlv $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + sllv $tmp3,$in2,$tmp1 + srlv $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + sllv $tmp3,$in3,$tmp1 + srlv $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + sllv $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +# endif +.Laligned_key: +#else + lwl $in0,0+MSB($inp) + lwl $in1,4+MSB($inp) + lwl $in2,8+MSB($inp) + lwl $in3,12+MSB($inp) + lwr $in0,0+LSB($inp) + lwr $in1,4+LSB($inp) + lwr $in2,8+LSB($inp) + lwr $in3,12+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS32R2) + wsbh $in0,$in0 # byte swap + wsbh $in1,$in1 + wsbh $in2,$in2 + wsbh $in3,$in3 + rotr $in0,$in0,16 + rotr $in1,$in1,16 + rotr $in2,$in2,16 + rotr $in3,$in3,16 +# else + srl $tmp0,$in0,24 # byte swap + srl $tmp1,$in0,8 + andi $tmp2,$in0,0xFF00 + sll $in0,$in0,24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or $in0,$tmp0 + srl $tmp0,$in1,24 + or $tmp1,$tmp2 + srl $tmp2,$in1,8 + or $in0,$tmp1 + andi $tmp1,$in1,0xFF00 + sll $in1,$in1,24 + andi $tmp2,0xFF00 + sll $tmp1,$tmp1,8 + or $in1,$tmp0 + srl $tmp0,$in2,24 + or $tmp2,$tmp1 + srl $tmp1,$in2,8 + or $in1,$tmp2 + andi $tmp2,$in2,0xFF00 + sll $in2,$in2,24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or $in2,$tmp0 + srl $tmp0,$in3,24 + or $tmp1,$tmp2 + srl $tmp2,$in3,8 + or $in2,$tmp1 + andi $tmp1,$in3,0xFF00 + sll $in3,$in3,24 + andi $tmp2,0xFF00 + sll $tmp1,$tmp1,8 + or $in3,$tmp0 + or $tmp2,$tmp1 + or $in3,$tmp2 +# endif +#endif + lui $tmp0,0x0fff + ori $tmp0,0xffff # 0x0fffffff + and $in0,$in0,$tmp0 + subu $tmp0,3 # 0x0ffffffc + and $in1,$in1,$tmp0 + and $in2,$in2,$tmp0 + and $in3,$in3,$tmp0 + + sw $in0,20($ctx) + sw $in1,24($ctx) + sw $in2,28($ctx) + sw $in3,32($ctx) + + srl $tmp1,$in1,2 + srl $tmp2,$in2,2 + srl $tmp3,$in3,2 + addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) + addu $in2,$in2,$tmp2 + addu $in3,$in3,$tmp3 + sw $in1,36($ctx) + sw $in2,40($ctx) + sw $in3,44($ctx) +.Lno_key: + li $v0,0 + jr $ra +.end poly1305_init +___ +{ +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0x00fff000" : "0x00ff0000"; + +my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = + ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); +my ($d0,$d1,$d2,$d3) = + ($a4,$a5,$a6,$a7); +my $shr = $t2; # used on R6 +my $one = $t2; # used on R2 + +$code.=<<___; +.globl poly1305_blocks +.align 5 +.ent poly1305_blocks +poly1305_blocks: + .frame $sp,16*4,$ra + .mask $SAVED_REGS_MASK,-4 + .set noreorder + subu $sp, $sp,4*12 + sw $s11,4*11($sp) + sw $s10,4*10($sp) + sw $s9, 4*9($sp) + sw $s8, 4*8($sp) + sw $s7, 4*7($sp) + sw $s6, 4*6($sp) + sw $s5, 4*5($sp) + sw $s4, 4*4($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sw $s3, 4*3($sp) + sw $s2, 4*2($sp) + sw $s1, 4*1($sp) + sw $s0, 4*0($sp) +___ +$code.=<<___; + .set reorder + + srl $len,4 # number of complete blocks + li $one,1 + beqz $len,.Labort + +#if defined(_MIPS_ARCH_MIPS32R6) + andi $shr,$inp,3 + subu $inp,$inp,$shr # align $inp + sll $shr,$shr,3 # byte to bit offset +#endif + + lw $h0,0($ctx) # load hash value + lw $h1,4($ctx) + lw $h2,8($ctx) + lw $h3,12($ctx) + lw $h4,16($ctx) + + lw $r0,20($ctx) # load key + lw $r1,24($ctx) + lw $r2,28($ctx) + lw $r3,32($ctx) + lw $rs1,36($ctx) + lw $rs2,40($ctx) + lw $rs3,44($ctx) + + sll $len,4 + addu $len,$len,$inp # end of buffer + b .Loop + +.align 4 +.Loop: +#if defined(_MIPS_ARCH_MIPS32R6) + lw $d0,0($inp) # load input + lw $d1,4($inp) + lw $d2,8($inp) + lw $d3,12($inp) + beqz $shr,.Laligned_inp + + lw $t0,16($inp) + subu $t1,$zero,$shr +# ifdef MIPSEB + sllv $d0,$d0,$shr + srlv $at,$d1,$t1 + sllv $d1,$d1,$shr + or $d0,$d0,$at + srlv $at,$d2,$t1 + sllv $d2,$d2,$shr + or $d1,$d1,$at + srlv $at,$d3,$t1 + sllv $d3,$d3,$shr + or $d2,$d2,$at + srlv $t0,$t0,$t1 + or $d3,$d3,$t0 +# else + srlv $d0,$d0,$shr + sllv $at,$d1,$t1 + srlv $d1,$d1,$shr + or $d0,$d0,$at + sllv $at,$d2,$t1 + srlv $d2,$d2,$shr + or $d1,$d1,$at + sllv $at,$d3,$t1 + srlv $d3,$d3,$shr + or $d2,$d2,$at + sllv $t0,$t0,$t1 + or $d3,$d3,$t0 +# endif +.Laligned_inp: +#else + lwl $d0,0+MSB($inp) # load input + lwl $d1,4+MSB($inp) + lwl $d2,8+MSB($inp) + lwl $d3,12+MSB($inp) + lwr $d0,0+LSB($inp) + lwr $d1,4+LSB($inp) + lwr $d2,8+LSB($inp) + lwr $d3,12+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS32R2) + wsbh $d0,$d0 # byte swap + wsbh $d1,$d1 + wsbh $d2,$d2 + wsbh $d3,$d3 + rotr $d0,$d0,16 + rotr $d1,$d1,16 + rotr $d2,$d2,16 + rotr $d3,$d3,16 +# else + srl $at,$d0,24 # byte swap + srl $t0,$d0,8 + andi $t1,$d0,0xFF00 + sll $d0,$d0,24 + andi $t0,0xFF00 + sll $t1,$t1,8 + or $d0,$at + srl $at,$d1,24 + or $t0,$t1 + srl $t1,$d1,8 + or $d0,$t0 + andi $t0,$d1,0xFF00 + sll $d1,$d1,24 + andi $t1,0xFF00 + sll $t0,$t0,8 + or $d1,$at + srl $at,$d2,24 + or $t1,$t0 + srl $t0,$d2,8 + or $d1,$t1 + andi $t1,$d2,0xFF00 + sll $d2,$d2,24 + andi $t0,0xFF00 + sll $t1,$t1,8 + or $d2,$at + srl $at,$d3,24 + or $t0,$t1 + srl $t1,$d3,8 + or $d2,$t0 + andi $t0,$d3,0xFF00 + sll $d3,$d3,24 + andi $t1,0xFF00 + sll $t0,$t0,8 + or $d3,$at + or $t1,$t0 + or $d3,$t1 +# endif +#endif + srl $t0,$h4,2 # modulo-scheduled reduction + andi $h4,$h4,3 + sll $at,$t0,2 + + addu $d0,$d0,$h0 # accumulate input + addu $t0,$t0,$at + sltu $h0,$d0,$h0 + addu $d0,$d0,$t0 # ... 
and residue + sltu $at,$d0,$t0 + + addu $d1,$d1,$h1 + addu $h0,$h0,$at # carry + sltu $h1,$d1,$h1 + addu $d1,$d1,$h0 + sltu $h0,$d1,$h0 + + addu $d2,$d2,$h2 + addu $h1,$h1,$h0 # carry + sltu $h2,$d2,$h2 + addu $d2,$d2,$h1 + sltu $h1,$d2,$h1 + + addu $d3,$d3,$h3 + addu $h2,$h2,$h1 # carry + sltu $h3,$d3,$h3 + addu $d3,$d3,$h2 + +#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) + multu $r0,$d0 # d0*r0 + sltu $h2,$d3,$h2 + maddu $rs3,$d1 # d1*s3 + addu $h3,$h3,$h2 # carry + maddu $rs2,$d2 # d2*s2 + addu $h4,$h4,$padbit + maddu $rs1,$d3 # d3*s1 + addu $h4,$h4,$h3 + mfhi $at + mflo $h0 + + multu $r1,$d0 # d0*r1 + maddu $r0,$d1 # d1*r0 + maddu $rs3,$d2 # d2*s3 + maddu $rs2,$d3 # d3*s2 + maddu $rs1,$h4 # h4*s1 + maddu $at,$one # hi*1 + mfhi $at + mflo $h1 + + multu $r2,$d0 # d0*r2 + maddu $r1,$d1 # d1*r1 + maddu $r0,$d2 # d2*r0 + maddu $rs3,$d3 # d3*s3 + maddu $rs2,$h4 # h4*s2 + maddu $at,$one # hi*1 + mfhi $at + mflo $h2 + + mul $t0,$r0,$h4 # h4*r0 + + multu $r3,$d0 # d0*r3 + maddu $r2,$d1 # d1*r2 + maddu $r1,$d2 # d2*r1 + maddu $r0,$d3 # d3*r0 + maddu $rs3,$h4 # h4*s3 + maddu $at,$one # hi*1 + mfhi $at + mflo $h3 + + addiu $inp,$inp,16 + + addu $h4,$t0,$at +#else + multu ($r0,$d0) # d0*r0 + mflo ($h0,$r0,$d0) + mfhi ($h1,$r0,$d0) + + sltu $h2,$d3,$h2 + addu $h3,$h3,$h2 # carry + + multu ($rs3,$d1) # d1*s3 + mflo ($at,$rs3,$d1) + mfhi ($t0,$rs3,$d1) + + addu $h4,$h4,$padbit + addiu $inp,$inp,16 + addu $h4,$h4,$h3 + + multu ($rs2,$d2) # d2*s2 + mflo ($a3,$rs2,$d2) + mfhi ($t1,$rs2,$d2) + addu $h0,$h0,$at + addu $h1,$h1,$t0 + multu ($rs1,$d3) # d3*s1 + sltu $at,$h0,$at + addu $h1,$h1,$at + + mflo ($at,$rs1,$d3) + mfhi ($t0,$rs1,$d3) + addu $h0,$h0,$a3 + addu $h1,$h1,$t1 + multu ($r1,$d0) # d0*r1 + sltu $a3,$h0,$a3 + addu $h1,$h1,$a3 + + + mflo ($a3,$r1,$d0) + mfhi ($h2,$r1,$d0) + addu $h0,$h0,$at + addu $h1,$h1,$t0 + multu ($r0,$d1) # d1*r0 + sltu $at,$h0,$at + addu $h1,$h1,$at + + mflo ($at,$r0,$d1) + mfhi ($t0,$r0,$d1) + addu $h1,$h1,$a3 + sltu $a3,$h1,$a3 + multu ($rs3,$d2) # d2*s3 + addu $h2,$h2,$a3 + + mflo ($a3,$rs3,$d2) + mfhi ($t1,$rs3,$d2) + addu $h1,$h1,$at + addu $h2,$h2,$t0 + multu ($rs2,$d3) # d3*s2 + sltu $at,$h1,$at + addu $h2,$h2,$at + + mflo ($at,$rs2,$d3) + mfhi ($t0,$rs2,$d3) + addu $h1,$h1,$a3 + addu $h2,$h2,$t1 + multu ($rs1,$h4) # h4*s1 + sltu $a3,$h1,$a3 + addu $h2,$h2,$a3 + + mflo ($a3,$rs1,$h4) + addu $h1,$h1,$at + addu $h2,$h2,$t0 + multu ($r2,$d0) # d0*r2 + sltu $at,$h1,$at + addu $h2,$h2,$at + + + mflo ($at,$r2,$d0) + mfhi ($h3,$r2,$d0) + addu $h1,$h1,$a3 + sltu $a3,$h1,$a3 + multu ($r1,$d1) # d1*r1 + addu $h2,$h2,$a3 + + mflo ($a3,$r1,$d1) + mfhi ($t1,$r1,$d1) + addu $h2,$h2,$at + sltu $at,$h2,$at + multu ($r0,$d2) # d2*r0 + addu $h3,$h3,$at + + mflo ($at,$r0,$d2) + mfhi ($t0,$r0,$d2) + addu $h2,$h2,$a3 + addu $h3,$h3,$t1 + multu ($rs3,$d3) # d3*s3 + sltu $a3,$h2,$a3 + addu $h3,$h3,$a3 + + mflo ($a3,$rs3,$d3) + mfhi ($t1,$rs3,$d3) + addu $h2,$h2,$at + addu $h3,$h3,$t0 + multu ($rs2,$h4) # h4*s2 + sltu $at,$h2,$at + addu $h3,$h3,$at + + mflo ($at,$rs2,$h4) + addu $h2,$h2,$a3 + addu $h3,$h3,$t1 + multu ($r3,$d0) # d0*r3 + sltu $a3,$h2,$a3 + addu $h3,$h3,$a3 + + + mflo ($a3,$r3,$d0) + mfhi ($t1,$r3,$d0) + addu $h2,$h2,$at + sltu $at,$h2,$at + multu ($r2,$d1) # d1*r2 + addu $h3,$h3,$at + + mflo ($at,$r2,$d1) + mfhi ($t0,$r2,$d1) + addu $h3,$h3,$a3 + sltu $a3,$h3,$a3 + multu ($r0,$d3) # d3*r0 + addu $t1,$t1,$a3 + + mflo ($a3,$r0,$d3) + mfhi ($d3,$r0,$d3) + addu $h3,$h3,$at + addu $t1,$t1,$t0 + multu ($r1,$d2) # d2*r1 + sltu $at,$h3,$at + addu $t1,$t1,$at + 
+ mflo ($at,$r1,$d2) + mfhi ($t0,$r1,$d2) + addu $h3,$h3,$a3 + addu $t1,$t1,$d3 + multu ($rs3,$h4) # h4*s3 + sltu $a3,$h3,$a3 + addu $t1,$t1,$a3 + + mflo ($a3,$rs3,$h4) + addu $h3,$h3,$at + addu $t1,$t1,$t0 + multu ($r0,$h4) # h4*r0 + sltu $at,$h3,$at + addu $t1,$t1,$at + + + mflo ($h4,$r0,$h4) + addu $h3,$h3,$a3 + sltu $a3,$h3,$a3 + addu $t1,$t1,$a3 + addu $h4,$h4,$t1 + + li $padbit,1 # if we loop, padbit is 1 +#endif + bne $inp,$len,.Loop + + sw $h0,0($ctx) # store hash value + sw $h1,4($ctx) + sw $h2,8($ctx) + sw $h3,12($ctx) + sw $h4,16($ctx) + + .set noreorder +.Labort: + lw $s11,4*11($sp) + lw $s10,4*10($sp) + lw $s9, 4*9($sp) + lw $s8, 4*8($sp) + lw $s7, 4*7($sp) + lw $s6, 4*6($sp) + lw $s5, 4*5($sp) + lw $s4, 4*4($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + lw $s3, 4*3($sp) + lw $s2, 4*2($sp) + lw $s1, 4*1($sp) + lw $s0, 4*0($sp) +___ +$code.=<<___; + jr $ra + addu $sp,$sp,4*12 +.end poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + lw $tmp4,16($ctx) + lw $tmp0,0($ctx) + lw $tmp1,4($ctx) + lw $tmp2,8($ctx) + lw $tmp3,12($ctx) + + li $in0,-4 # final reduction + srl $ctx,$tmp4,2 + and $in0,$in0,$tmp4 + andi $tmp4,$tmp4,3 + addu $ctx,$ctx,$in0 + + addu $tmp0,$tmp0,$ctx + sltu $ctx,$tmp0,$ctx + addiu $in0,$tmp0,5 # compare to modulus + addu $tmp1,$tmp1,$ctx + sltiu $in1,$in0,5 + sltu $ctx,$tmp1,$ctx + addu $in1,$in1,$tmp1 + addu $tmp2,$tmp2,$ctx + sltu $in2,$in1,$tmp1 + sltu $ctx,$tmp2,$ctx + addu $in2,$in2,$tmp2 + addu $tmp3,$tmp3,$ctx + sltu $in3,$in2,$tmp2 + sltu $ctx,$tmp3,$ctx + addu $in3,$in3,$tmp3 + addu $tmp4,$tmp4,$ctx + sltu $ctx,$in3,$tmp3 + addu $ctx,$tmp4 + + srl $ctx,2 # see if it carried/borrowed + subu $ctx,$zero,$ctx + + xor $in0,$tmp0 + xor $in1,$tmp1 + xor $in2,$tmp2 + xor $in3,$tmp3 + and $in0,$ctx + and $in1,$ctx + and $in2,$ctx + and $in3,$ctx + xor $in0,$tmp0 + xor $in1,$tmp1 + xor $in2,$tmp2 + xor $in3,$tmp3 + + lw $tmp0,0($nonce) # load nonce + lw $tmp1,4($nonce) + lw $tmp2,8($nonce) + lw $tmp3,12($nonce) + + addu $in0,$tmp0 # accumulate nonce + sltu $ctx,$in0,$tmp0 + + addu $in1,$tmp1 + sltu $tmp1,$in1,$tmp1 + addu $in1,$ctx + sltu $ctx,$in1,$ctx + addu $ctx,$tmp1 + + addu $in2,$tmp2 + sltu $tmp2,$in2,$tmp2 + addu $in2,$ctx + sltu $ctx,$in2,$ctx + addu $ctx,$tmp2 + + addu $in3,$tmp3 + addu $in3,$ctx + + srl $tmp0,$in0,8 # write mac value + srl $tmp1,$in0,16 + srl $tmp2,$in0,24 + sb $in0, 0($mac) + sb $tmp0,1($mac) + srl $tmp0,$in1,8 + sb $tmp1,2($mac) + srl $tmp1,$in1,16 + sb $tmp2,3($mac) + srl $tmp2,$in1,24 + sb $in1, 4($mac) + sb $tmp0,5($mac) + srl $tmp0,$in2,8 + sb $tmp1,6($mac) + srl $tmp1,$in2,16 + sb $tmp2,7($mac) + srl $tmp2,$in2,24 + sb $in2, 8($mac) + sb $tmp0,9($mac) + srl $tmp0,$in3,8 + sb $tmp1,10($mac) + srl $tmp1,$in3,16 + sb $tmp2,11($mac) + srl $tmp2,$in3,24 + sb $in3, 12($mac) + sb $tmp0,13($mac) + sb $tmp1,14($mac) + sb $tmp2,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" +.align 2 +___ +} +}}} + +$output=pop and open STDOUT,">$output"; +print $code; +close STDOUT; -- cgit v1.2.3 From 676d45aba8c498dc14ffd27338dba8d4395787e2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 19 Jun 2025 12:19:03 -0700 Subject: lib/crypto: powerpc: Move arch/powerpc/lib/crypto/ into lib/crypto/ Move the contents of arch/powerpc/lib/crypto/ into lib/crypto/powerpc/. 
The new code organization makes a lot more sense for how this code actually works and is developed. In particular, it makes it possible to build each algorithm as a single module, with better inlining and dead code elimination. For a more detailed explanation, see the patchset which did this for the CRC library code: https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/. Also see the patchset which did this for SHA-512: https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/ This is just a preparatory commit, which does the move to get the files into their new location but keeps them building the same way as before. Later commits will make the actual improvements to the way the arch-optimized code is integrated for each algorithm. Acked-by: Ard Biesheuvel Reviewed-by: Martin K. Petersen Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/r/20250619191908.134235-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/powerpc/lib/Makefile | 2 - arch/powerpc/lib/crypto/Kconfig | 22 - arch/powerpc/lib/crypto/Makefile | 10 - arch/powerpc/lib/crypto/chacha-p10-glue.c | 100 --- arch/powerpc/lib/crypto/chacha-p10le-8x.S | 840 --------------------- arch/powerpc/lib/crypto/poly1305-p10-glue.c | 96 --- arch/powerpc/lib/crypto/poly1305-p10le_64.S | 1075 --------------------------- arch/powerpc/lib/crypto/sha256-spe-asm.S | 318 -------- arch/powerpc/lib/crypto/sha256.c | 70 -- lib/crypto/Kconfig | 2 +- lib/crypto/Makefile | 1 + lib/crypto/powerpc/Kconfig | 22 + lib/crypto/powerpc/Makefile | 10 + lib/crypto/powerpc/chacha-p10-glue.c | 100 +++ lib/crypto/powerpc/chacha-p10le-8x.S | 840 +++++++++++++++++++++ lib/crypto/powerpc/poly1305-p10-glue.c | 96 +++ lib/crypto/powerpc/poly1305-p10le_64.S | 1075 +++++++++++++++++++++++++++ lib/crypto/powerpc/sha256-spe-asm.S | 318 ++++++++ lib/crypto/powerpc/sha256.c | 70 ++ 19 files changed, 2533 insertions(+), 2534 deletions(-) delete mode 100644 arch/powerpc/lib/crypto/Kconfig delete mode 100644 arch/powerpc/lib/crypto/Makefile delete mode 100644 arch/powerpc/lib/crypto/chacha-p10-glue.c delete mode 100644 arch/powerpc/lib/crypto/chacha-p10le-8x.S delete mode 100644 arch/powerpc/lib/crypto/poly1305-p10-glue.c delete mode 100644 arch/powerpc/lib/crypto/poly1305-p10le_64.S delete mode 100644 arch/powerpc/lib/crypto/sha256-spe-asm.S delete mode 100644 arch/powerpc/lib/crypto/sha256.c create mode 100644 lib/crypto/powerpc/Kconfig create mode 100644 lib/crypto/powerpc/Makefile create mode 100644 lib/crypto/powerpc/chacha-p10-glue.c create mode 100644 lib/crypto/powerpc/chacha-p10le-8x.S create mode 100644 lib/crypto/powerpc/poly1305-p10-glue.c create mode 100644 lib/crypto/powerpc/poly1305-p10le_64.S create mode 100644 lib/crypto/powerpc/sha256-spe-asm.S create mode 100644 lib/crypto/powerpc/sha256.c diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 481f968e42c7..27f8a0143860 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -3,8 +3,6 @@ # Makefile for ppc-specific library files.. 
# -obj-y += crypto/ - CFLAGS_code-patching.o += -fno-stack-protector CFLAGS_feature-fixups.o += -fno-stack-protector diff --git a/arch/powerpc/lib/crypto/Kconfig b/arch/powerpc/lib/crypto/Kconfig deleted file mode 100644 index 3f9e1bbd9905..000000000000 --- a/arch/powerpc/lib/crypto/Kconfig +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA20_P10 - tristate - depends on PPC64 && CPU_LITTLE_ENDIAN && VSX - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_P10 - tristate - depends on PPC64 && CPU_LITTLE_ENDIAN && VSX - depends on BROKEN # Needs to be fixed to work in softirq context - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - select CRYPTO_LIB_POLY1305_GENERIC - -config CRYPTO_SHA256_PPC_SPE - tristate - depends on SPE - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 diff --git a/arch/powerpc/lib/crypto/Makefile b/arch/powerpc/lib/crypto/Makefile deleted file mode 100644 index 27f231f8e334..000000000000 --- a/arch/powerpc/lib/crypto/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o -chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o - -obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o -poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o - -obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o -sha256-ppc-spe-y := sha256.o sha256-spe-asm.o diff --git a/arch/powerpc/lib/crypto/chacha-p10-glue.c b/arch/powerpc/lib/crypto/chacha-p10-glue.c deleted file mode 100644 index fcd23c6f1590..000000000000 --- a/arch/powerpc/lib/crypto/chacha-p10-glue.c +++ /dev/null @@ -1,100 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * ChaCha stream cipher (P10 accelerated) - * - * Copyright 2023- IBM Corp. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, - const u8 *src, unsigned int len, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); - -static void vsx_begin(void) -{ - preempt_disable(); - enable_kernel_vsx(); -} - -static void vsx_end(void) -{ - disable_kernel_vsx(); - preempt_enable(); -} - -static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - unsigned int l = bytes & ~0x0FF; - - if (l > 0) { - chacha_p10le_8x(state, dst, src, l, nrounds); - bytes -= l; - src += l; - dst += l; - state->x[12] += l / CHACHA_BLOCK_SIZE; - } - - if (bytes > 0) - chacha_crypt_generic(state, dst, src, bytes, nrounds); -} - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - hchacha_block_generic(state, out, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE || - !crypto_simd_usable()) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - vsx_begin(); - chacha_p10_do_8x(state, dst, src, todo, nrounds); - vsx_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&have_p10); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -static int __init chacha_p10_init(void) -{ - if (cpu_has_feature(CPU_FTR_ARCH_31)) - static_branch_enable(&have_p10); - return 0; -} -subsys_initcall(chacha_p10_init); - -static void __exit chacha_p10_exit(void) -{ -} -module_exit(chacha_p10_exit); - -MODULE_DESCRIPTION("ChaCha stream cipher (P10 accelerated)"); -MODULE_AUTHOR("Danny Tsen "); -MODULE_LICENSE("GPL v2"); diff --git a/arch/powerpc/lib/crypto/chacha-p10le-8x.S b/arch/powerpc/lib/crypto/chacha-p10le-8x.S deleted file mode 100644 index b29562bd5d40..000000000000 --- a/arch/powerpc/lib/crypto/chacha-p10le-8x.S +++ /dev/null @@ -1,840 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -# -# Accelerated chacha20 implementation for ppc64le. -# -# Copyright 2023- IBM Corp. All rights reserved -# -#=================================================================================== -# Written by Danny Tsen -# -# do rounds, 8 quarter rounds -# 1. a += b; d ^= a; d <<<= 16; -# 2. c += d; b ^= c; b <<<= 12; -# 3. a += b; d ^= a; d <<<= 8; -# 4. c += d; b ^= c; b <<<= 7 -# -# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16 -# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12 -# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8 -# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7 -# -# 4 blocks (a b c d) -# -# a0 b0 c0 d0 -# a1 b1 c1 d1 -# ... -# a4 b4 c4 d4 -# ... -# a8 b8 c8 d8 -# ... -# a12 b12 c12 d12 -# a13 ... -# a14 ... 
-# a15 b15 c15 d15 -# -# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) -# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) -# - -#include -#include -#include -#include - -.machine "any" -.text - -.macro SAVE_GPR GPR OFFSET FRAME - std \GPR,\OFFSET(\FRAME) -.endm - -.macro SAVE_VRS VRS OFFSET FRAME - li 16, \OFFSET - stvx \VRS, 16, \FRAME -.endm - -.macro SAVE_VSX VSX OFFSET FRAME - li 16, \OFFSET - stxvx \VSX, 16, \FRAME -.endm - -.macro RESTORE_GPR GPR OFFSET FRAME - ld \GPR,\OFFSET(\FRAME) -.endm - -.macro RESTORE_VRS VRS OFFSET FRAME - li 16, \OFFSET - lvx \VRS, 16, \FRAME -.endm - -.macro RESTORE_VSX VSX OFFSET FRAME - li 16, \OFFSET - lxvx \VSX, 16, \FRAME -.endm - -.macro SAVE_REGS - mflr 0 - std 0, 16(1) - stdu 1,-752(1) - - SAVE_GPR 14, 112, 1 - SAVE_GPR 15, 120, 1 - SAVE_GPR 16, 128, 1 - SAVE_GPR 17, 136, 1 - SAVE_GPR 18, 144, 1 - SAVE_GPR 19, 152, 1 - SAVE_GPR 20, 160, 1 - SAVE_GPR 21, 168, 1 - SAVE_GPR 22, 176, 1 - SAVE_GPR 23, 184, 1 - SAVE_GPR 24, 192, 1 - SAVE_GPR 25, 200, 1 - SAVE_GPR 26, 208, 1 - SAVE_GPR 27, 216, 1 - SAVE_GPR 28, 224, 1 - SAVE_GPR 29, 232, 1 - SAVE_GPR 30, 240, 1 - SAVE_GPR 31, 248, 1 - - addi 9, 1, 256 - SAVE_VRS 20, 0, 9 - SAVE_VRS 21, 16, 9 - SAVE_VRS 22, 32, 9 - SAVE_VRS 23, 48, 9 - SAVE_VRS 24, 64, 9 - SAVE_VRS 25, 80, 9 - SAVE_VRS 26, 96, 9 - SAVE_VRS 27, 112, 9 - SAVE_VRS 28, 128, 9 - SAVE_VRS 29, 144, 9 - SAVE_VRS 30, 160, 9 - SAVE_VRS 31, 176, 9 - - SAVE_VSX 14, 192, 9 - SAVE_VSX 15, 208, 9 - SAVE_VSX 16, 224, 9 - SAVE_VSX 17, 240, 9 - SAVE_VSX 18, 256, 9 - SAVE_VSX 19, 272, 9 - SAVE_VSX 20, 288, 9 - SAVE_VSX 21, 304, 9 - SAVE_VSX 22, 320, 9 - SAVE_VSX 23, 336, 9 - SAVE_VSX 24, 352, 9 - SAVE_VSX 25, 368, 9 - SAVE_VSX 26, 384, 9 - SAVE_VSX 27, 400, 9 - SAVE_VSX 28, 416, 9 - SAVE_VSX 29, 432, 9 - SAVE_VSX 30, 448, 9 - SAVE_VSX 31, 464, 9 -.endm # SAVE_REGS - -.macro RESTORE_REGS - addi 9, 1, 256 - RESTORE_VRS 20, 0, 9 - RESTORE_VRS 21, 16, 9 - RESTORE_VRS 22, 32, 9 - RESTORE_VRS 23, 48, 9 - RESTORE_VRS 24, 64, 9 - RESTORE_VRS 25, 80, 9 - RESTORE_VRS 26, 96, 9 - RESTORE_VRS 27, 112, 9 - RESTORE_VRS 28, 128, 9 - RESTORE_VRS 29, 144, 9 - RESTORE_VRS 30, 160, 9 - RESTORE_VRS 31, 176, 9 - - RESTORE_VSX 14, 192, 9 - RESTORE_VSX 15, 208, 9 - RESTORE_VSX 16, 224, 9 - RESTORE_VSX 17, 240, 9 - RESTORE_VSX 18, 256, 9 - RESTORE_VSX 19, 272, 9 - RESTORE_VSX 20, 288, 9 - RESTORE_VSX 21, 304, 9 - RESTORE_VSX 22, 320, 9 - RESTORE_VSX 23, 336, 9 - RESTORE_VSX 24, 352, 9 - RESTORE_VSX 25, 368, 9 - RESTORE_VSX 26, 384, 9 - RESTORE_VSX 27, 400, 9 - RESTORE_VSX 28, 416, 9 - RESTORE_VSX 29, 432, 9 - RESTORE_VSX 30, 448, 9 - RESTORE_VSX 31, 464, 9 - - RESTORE_GPR 14, 112, 1 - RESTORE_GPR 15, 120, 1 - RESTORE_GPR 16, 128, 1 - RESTORE_GPR 17, 136, 1 - RESTORE_GPR 18, 144, 1 - RESTORE_GPR 19, 152, 1 - RESTORE_GPR 20, 160, 1 - RESTORE_GPR 21, 168, 1 - RESTORE_GPR 22, 176, 1 - RESTORE_GPR 23, 184, 1 - RESTORE_GPR 24, 192, 1 - RESTORE_GPR 25, 200, 1 - RESTORE_GPR 26, 208, 1 - RESTORE_GPR 27, 216, 1 - RESTORE_GPR 28, 224, 1 - RESTORE_GPR 29, 232, 1 - RESTORE_GPR 30, 240, 1 - RESTORE_GPR 31, 248, 1 - - addi 1, 1, 752 - ld 0, 16(1) - mtlr 0 -.endm # RESTORE_REGS - -.macro QT_loop_8x - # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) - xxlor 0, 32+25, 32+25 - xxlor 32+25, 20, 20 - vadduwm 0, 0, 4 - vadduwm 1, 1, 5 - vadduwm 2, 2, 6 - vadduwm 3, 3, 7 - vadduwm 16, 16, 20 - vadduwm 17, 17, 21 - vadduwm 18, 18, 22 - vadduwm 19, 19, 23 - - vpermxor 12, 12, 0, 25 - vpermxor 13, 13, 1, 25 
- vpermxor 14, 14, 2, 25 - vpermxor 15, 15, 3, 25 - vpermxor 28, 28, 16, 25 - vpermxor 29, 29, 17, 25 - vpermxor 30, 30, 18, 25 - vpermxor 31, 31, 19, 25 - xxlor 32+25, 0, 0 - vadduwm 8, 8, 12 - vadduwm 9, 9, 13 - vadduwm 10, 10, 14 - vadduwm 11, 11, 15 - vadduwm 24, 24, 28 - vadduwm 25, 25, 29 - vadduwm 26, 26, 30 - vadduwm 27, 27, 31 - vxor 4, 4, 8 - vxor 5, 5, 9 - vxor 6, 6, 10 - vxor 7, 7, 11 - vxor 20, 20, 24 - vxor 21, 21, 25 - vxor 22, 22, 26 - vxor 23, 23, 27 - - xxlor 0, 32+25, 32+25 - xxlor 32+25, 21, 21 - vrlw 4, 4, 25 # - vrlw 5, 5, 25 - vrlw 6, 6, 25 - vrlw 7, 7, 25 - vrlw 20, 20, 25 # - vrlw 21, 21, 25 - vrlw 22, 22, 25 - vrlw 23, 23, 25 - xxlor 32+25, 0, 0 - vadduwm 0, 0, 4 - vadduwm 1, 1, 5 - vadduwm 2, 2, 6 - vadduwm 3, 3, 7 - vadduwm 16, 16, 20 - vadduwm 17, 17, 21 - vadduwm 18, 18, 22 - vadduwm 19, 19, 23 - - xxlor 0, 32+25, 32+25 - xxlor 32+25, 22, 22 - vpermxor 12, 12, 0, 25 - vpermxor 13, 13, 1, 25 - vpermxor 14, 14, 2, 25 - vpermxor 15, 15, 3, 25 - vpermxor 28, 28, 16, 25 - vpermxor 29, 29, 17, 25 - vpermxor 30, 30, 18, 25 - vpermxor 31, 31, 19, 25 - xxlor 32+25, 0, 0 - vadduwm 8, 8, 12 - vadduwm 9, 9, 13 - vadduwm 10, 10, 14 - vadduwm 11, 11, 15 - vadduwm 24, 24, 28 - vadduwm 25, 25, 29 - vadduwm 26, 26, 30 - vadduwm 27, 27, 31 - xxlor 0, 32+28, 32+28 - xxlor 32+28, 23, 23 - vxor 4, 4, 8 - vxor 5, 5, 9 - vxor 6, 6, 10 - vxor 7, 7, 11 - vxor 20, 20, 24 - vxor 21, 21, 25 - vxor 22, 22, 26 - vxor 23, 23, 27 - vrlw 4, 4, 28 # - vrlw 5, 5, 28 - vrlw 6, 6, 28 - vrlw 7, 7, 28 - vrlw 20, 20, 28 # - vrlw 21, 21, 28 - vrlw 22, 22, 28 - vrlw 23, 23, 28 - xxlor 32+28, 0, 0 - - # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) - xxlor 0, 32+25, 32+25 - xxlor 32+25, 20, 20 - vadduwm 0, 0, 5 - vadduwm 1, 1, 6 - vadduwm 2, 2, 7 - vadduwm 3, 3, 4 - vadduwm 16, 16, 21 - vadduwm 17, 17, 22 - vadduwm 18, 18, 23 - vadduwm 19, 19, 20 - - vpermxor 15, 15, 0, 25 - vpermxor 12, 12, 1, 25 - vpermxor 13, 13, 2, 25 - vpermxor 14, 14, 3, 25 - vpermxor 31, 31, 16, 25 - vpermxor 28, 28, 17, 25 - vpermxor 29, 29, 18, 25 - vpermxor 30, 30, 19, 25 - - xxlor 32+25, 0, 0 - vadduwm 10, 10, 15 - vadduwm 11, 11, 12 - vadduwm 8, 8, 13 - vadduwm 9, 9, 14 - vadduwm 26, 26, 31 - vadduwm 27, 27, 28 - vadduwm 24, 24, 29 - vadduwm 25, 25, 30 - vxor 5, 5, 10 - vxor 6, 6, 11 - vxor 7, 7, 8 - vxor 4, 4, 9 - vxor 21, 21, 26 - vxor 22, 22, 27 - vxor 23, 23, 24 - vxor 20, 20, 25 - - xxlor 0, 32+25, 32+25 - xxlor 32+25, 21, 21 - vrlw 5, 5, 25 - vrlw 6, 6, 25 - vrlw 7, 7, 25 - vrlw 4, 4, 25 - vrlw 21, 21, 25 - vrlw 22, 22, 25 - vrlw 23, 23, 25 - vrlw 20, 20, 25 - xxlor 32+25, 0, 0 - - vadduwm 0, 0, 5 - vadduwm 1, 1, 6 - vadduwm 2, 2, 7 - vadduwm 3, 3, 4 - vadduwm 16, 16, 21 - vadduwm 17, 17, 22 - vadduwm 18, 18, 23 - vadduwm 19, 19, 20 - - xxlor 0, 32+25, 32+25 - xxlor 32+25, 22, 22 - vpermxor 15, 15, 0, 25 - vpermxor 12, 12, 1, 25 - vpermxor 13, 13, 2, 25 - vpermxor 14, 14, 3, 25 - vpermxor 31, 31, 16, 25 - vpermxor 28, 28, 17, 25 - vpermxor 29, 29, 18, 25 - vpermxor 30, 30, 19, 25 - xxlor 32+25, 0, 0 - - vadduwm 10, 10, 15 - vadduwm 11, 11, 12 - vadduwm 8, 8, 13 - vadduwm 9, 9, 14 - vadduwm 26, 26, 31 - vadduwm 27, 27, 28 - vadduwm 24, 24, 29 - vadduwm 25, 25, 30 - - xxlor 0, 32+28, 32+28 - xxlor 32+28, 23, 23 - vxor 5, 5, 10 - vxor 6, 6, 11 - vxor 7, 7, 8 - vxor 4, 4, 9 - vxor 21, 21, 26 - vxor 22, 22, 27 - vxor 23, 23, 24 - vxor 20, 20, 25 - vrlw 5, 5, 28 - vrlw 6, 6, 28 - vrlw 7, 7, 28 - vrlw 4, 4, 28 - vrlw 21, 21, 28 - vrlw 22, 22, 28 - vrlw 23, 23, 28 - vrlw 20, 20, 28 - xxlor 32+28, 0, 
0 -.endm - -.macro QT_loop_4x - # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) - vadduwm 0, 0, 4 - vadduwm 1, 1, 5 - vadduwm 2, 2, 6 - vadduwm 3, 3, 7 - vpermxor 12, 12, 0, 20 - vpermxor 13, 13, 1, 20 - vpermxor 14, 14, 2, 20 - vpermxor 15, 15, 3, 20 - vadduwm 8, 8, 12 - vadduwm 9, 9, 13 - vadduwm 10, 10, 14 - vadduwm 11, 11, 15 - vxor 4, 4, 8 - vxor 5, 5, 9 - vxor 6, 6, 10 - vxor 7, 7, 11 - vrlw 4, 4, 21 - vrlw 5, 5, 21 - vrlw 6, 6, 21 - vrlw 7, 7, 21 - vadduwm 0, 0, 4 - vadduwm 1, 1, 5 - vadduwm 2, 2, 6 - vadduwm 3, 3, 7 - vpermxor 12, 12, 0, 22 - vpermxor 13, 13, 1, 22 - vpermxor 14, 14, 2, 22 - vpermxor 15, 15, 3, 22 - vadduwm 8, 8, 12 - vadduwm 9, 9, 13 - vadduwm 10, 10, 14 - vadduwm 11, 11, 15 - vxor 4, 4, 8 - vxor 5, 5, 9 - vxor 6, 6, 10 - vxor 7, 7, 11 - vrlw 4, 4, 23 - vrlw 5, 5, 23 - vrlw 6, 6, 23 - vrlw 7, 7, 23 - - # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) - vadduwm 0, 0, 5 - vadduwm 1, 1, 6 - vadduwm 2, 2, 7 - vadduwm 3, 3, 4 - vpermxor 15, 15, 0, 20 - vpermxor 12, 12, 1, 20 - vpermxor 13, 13, 2, 20 - vpermxor 14, 14, 3, 20 - vadduwm 10, 10, 15 - vadduwm 11, 11, 12 - vadduwm 8, 8, 13 - vadduwm 9, 9, 14 - vxor 5, 5, 10 - vxor 6, 6, 11 - vxor 7, 7, 8 - vxor 4, 4, 9 - vrlw 5, 5, 21 - vrlw 6, 6, 21 - vrlw 7, 7, 21 - vrlw 4, 4, 21 - vadduwm 0, 0, 5 - vadduwm 1, 1, 6 - vadduwm 2, 2, 7 - vadduwm 3, 3, 4 - vpermxor 15, 15, 0, 22 - vpermxor 12, 12, 1, 22 - vpermxor 13, 13, 2, 22 - vpermxor 14, 14, 3, 22 - vadduwm 10, 10, 15 - vadduwm 11, 11, 12 - vadduwm 8, 8, 13 - vadduwm 9, 9, 14 - vxor 5, 5, 10 - vxor 6, 6, 11 - vxor 7, 7, 8 - vxor 4, 4, 9 - vrlw 5, 5, 23 - vrlw 6, 6, 23 - vrlw 7, 7, 23 - vrlw 4, 4, 23 -.endm - -# Transpose -.macro TP_4x a0 a1 a2 a3 - xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1 - xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3 - xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1 - xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3 - xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3 - xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3 - xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3 - xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3 -.endm - -# key stream = working state + state -.macro Add_state S - vadduwm \S+0, \S+0, 16-\S - vadduwm \S+4, \S+4, 17-\S - vadduwm \S+8, \S+8, 18-\S - vadduwm \S+12, \S+12, 19-\S - - vadduwm \S+1, \S+1, 16-\S - vadduwm \S+5, \S+5, 17-\S - vadduwm \S+9, \S+9, 18-\S - vadduwm \S+13, \S+13, 19-\S - - vadduwm \S+2, \S+2, 16-\S - vadduwm \S+6, \S+6, 17-\S - vadduwm \S+10, \S+10, 18-\S - vadduwm \S+14, \S+14, 19-\S - - vadduwm \S+3, \S+3, 16-\S - vadduwm \S+7, \S+7, 17-\S - vadduwm \S+11, \S+11, 18-\S - vadduwm \S+15, \S+15, 19-\S -.endm - -# -# write 256 bytes -# -.macro Write_256 S - add 9, 14, 5 - add 16, 14, 4 - lxvw4x 0, 0, 9 - lxvw4x 1, 17, 9 - lxvw4x 2, 18, 9 - lxvw4x 3, 19, 9 - lxvw4x 4, 20, 9 - lxvw4x 5, 21, 9 - lxvw4x 6, 22, 9 - lxvw4x 7, 23, 9 - lxvw4x 8, 24, 9 - lxvw4x 9, 25, 9 - lxvw4x 10, 26, 9 - lxvw4x 11, 27, 9 - lxvw4x 12, 28, 9 - lxvw4x 13, 29, 9 - lxvw4x 14, 30, 9 - lxvw4x 15, 31, 9 - - xxlxor \S+32, \S+32, 0 - xxlxor \S+36, \S+36, 1 - xxlxor \S+40, \S+40, 2 - xxlxor \S+44, \S+44, 3 - xxlxor \S+33, \S+33, 4 - xxlxor \S+37, \S+37, 5 - xxlxor \S+41, \S+41, 6 - xxlxor \S+45, \S+45, 7 - xxlxor \S+34, \S+34, 8 - xxlxor \S+38, \S+38, 9 - xxlxor \S+42, \S+42, 10 - xxlxor \S+46, \S+46, 11 - xxlxor \S+35, \S+35, 12 - xxlxor \S+39, \S+39, 13 - xxlxor \S+43, \S+43, 14 - xxlxor \S+47, \S+47, 15 - - stxvw4x \S+32, 0, 16 - stxvw4x \S+36, 17, 16 - stxvw4x \S+40, 18, 16 - stxvw4x \S+44, 19, 16 - - stxvw4x \S+33, 20, 
16 - stxvw4x \S+37, 21, 16 - stxvw4x \S+41, 22, 16 - stxvw4x \S+45, 23, 16 - - stxvw4x \S+34, 24, 16 - stxvw4x \S+38, 25, 16 - stxvw4x \S+42, 26, 16 - stxvw4x \S+46, 27, 16 - - stxvw4x \S+35, 28, 16 - stxvw4x \S+39, 29, 16 - stxvw4x \S+43, 30, 16 - stxvw4x \S+47, 31, 16 - -.endm - -# -# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src, -# unsigned int len, int nrounds); -# -SYM_FUNC_START(chacha_p10le_8x) -.align 5 - cmpdi 6, 0 - ble Out_no_chacha - - SAVE_REGS - - # r17 - r31 mainly for Write_256 macro. - li 17, 16 - li 18, 32 - li 19, 48 - li 20, 64 - li 21, 80 - li 22, 96 - li 23, 112 - li 24, 128 - li 25, 144 - li 26, 160 - li 27, 176 - li 28, 192 - li 29, 208 - li 30, 224 - li 31, 240 - - mr 15, 6 # len - li 14, 0 # offset to inp and outp - - lxvw4x 48, 0, 3 # vr16, constants - lxvw4x 49, 17, 3 # vr17, key 1 - lxvw4x 50, 18, 3 # vr18, key 2 - lxvw4x 51, 19, 3 # vr19, counter, nonce - - # create (0, 1, 2, 3) counters - vspltisw 0, 0 - vspltisw 1, 1 - vspltisw 2, 2 - vspltisw 3, 3 - vmrghw 4, 0, 1 - vmrglw 5, 2, 3 - vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3) - - vspltisw 21, 12 - vspltisw 23, 7 - - addis 11, 2, permx@toc@ha - addi 11, 11, permx@toc@l - lxvw4x 32+20, 0, 11 - lxvw4x 32+22, 17, 11 - - sradi 8, 7, 1 - - mtctr 8 - - # save constants to vsx - xxlor 16, 48, 48 - xxlor 17, 49, 49 - xxlor 18, 50, 50 - xxlor 19, 51, 51 - - vspltisw 25, 4 - vspltisw 26, 8 - - xxlor 25, 32+26, 32+26 - xxlor 24, 32+25, 32+25 - - vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4) - xxlor 30, 32+30, 32+30 - xxlor 31, 32+31, 32+31 - - xxlor 20, 32+20, 32+20 - xxlor 21, 32+21, 32+21 - xxlor 22, 32+22, 32+22 - xxlor 23, 32+23, 32+23 - - cmpdi 6, 512 - blt Loop_last - -Loop_8x: - xxspltw 32+0, 16, 0 - xxspltw 32+1, 16, 1 - xxspltw 32+2, 16, 2 - xxspltw 32+3, 16, 3 - - xxspltw 32+4, 17, 0 - xxspltw 32+5, 17, 1 - xxspltw 32+6, 17, 2 - xxspltw 32+7, 17, 3 - xxspltw 32+8, 18, 0 - xxspltw 32+9, 18, 1 - xxspltw 32+10, 18, 2 - xxspltw 32+11, 18, 3 - xxspltw 32+12, 19, 0 - xxspltw 32+13, 19, 1 - xxspltw 32+14, 19, 2 - xxspltw 32+15, 19, 3 - vadduwm 12, 12, 30 # increase counter - - xxspltw 32+16, 16, 0 - xxspltw 32+17, 16, 1 - xxspltw 32+18, 16, 2 - xxspltw 32+19, 16, 3 - - xxspltw 32+20, 17, 0 - xxspltw 32+21, 17, 1 - xxspltw 32+22, 17, 2 - xxspltw 32+23, 17, 3 - xxspltw 32+24, 18, 0 - xxspltw 32+25, 18, 1 - xxspltw 32+26, 18, 2 - xxspltw 32+27, 18, 3 - xxspltw 32+28, 19, 0 - xxspltw 32+29, 19, 1 - vadduwm 28, 28, 31 # increase counter - xxspltw 32+30, 19, 2 - xxspltw 32+31, 19, 3 - -.align 5 -quarter_loop_8x: - QT_loop_8x - - bdnz quarter_loop_8x - - xxlor 0, 32+30, 32+30 - xxlor 32+30, 30, 30 - vadduwm 12, 12, 30 - xxlor 32+30, 0, 0 - TP_4x 0, 1, 2, 3 - TP_4x 4, 5, 6, 7 - TP_4x 8, 9, 10, 11 - TP_4x 12, 13, 14, 15 - - xxlor 0, 48, 48 - xxlor 1, 49, 49 - xxlor 2, 50, 50 - xxlor 3, 51, 51 - xxlor 48, 16, 16 - xxlor 49, 17, 17 - xxlor 50, 18, 18 - xxlor 51, 19, 19 - Add_state 0 - xxlor 48, 0, 0 - xxlor 49, 1, 1 - xxlor 50, 2, 2 - xxlor 51, 3, 3 - Write_256 0 - addi 14, 14, 256 # offset +=256 - addi 15, 15, -256 # len -=256 - - xxlor 5, 32+31, 32+31 - xxlor 32+31, 31, 31 - vadduwm 28, 28, 31 - xxlor 32+31, 5, 5 - TP_4x 16+0, 16+1, 16+2, 16+3 - TP_4x 16+4, 16+5, 16+6, 16+7 - TP_4x 16+8, 16+9, 16+10, 16+11 - TP_4x 16+12, 16+13, 16+14, 16+15 - - xxlor 32, 16, 16 - xxlor 33, 17, 17 - xxlor 34, 18, 18 - xxlor 35, 19, 19 - Add_state 16 - Write_256 16 - addi 14, 14, 256 # offset +=256 - addi 15, 15, -256 # len +=256 - - xxlor 32+24, 24, 24 - xxlor 32+25, 25, 25 - xxlor 32+30, 
30, 30 - vadduwm 30, 30, 25 - vadduwm 31, 30, 24 - xxlor 30, 32+30, 32+30 - xxlor 31, 32+31, 32+31 - - cmpdi 15, 0 - beq Out_loop - - cmpdi 15, 512 - blt Loop_last - - mtctr 8 - b Loop_8x - -Loop_last: - lxvw4x 48, 0, 3 # vr16, constants - lxvw4x 49, 17, 3 # vr17, key 1 - lxvw4x 50, 18, 3 # vr18, key 2 - lxvw4x 51, 19, 3 # vr19, counter, nonce - - vspltisw 21, 12 - vspltisw 23, 7 - addis 11, 2, permx@toc@ha - addi 11, 11, permx@toc@l - lxvw4x 32+20, 0, 11 - lxvw4x 32+22, 17, 11 - - sradi 8, 7, 1 - mtctr 8 - -Loop_4x: - vspltw 0, 16, 0 - vspltw 1, 16, 1 - vspltw 2, 16, 2 - vspltw 3, 16, 3 - - vspltw 4, 17, 0 - vspltw 5, 17, 1 - vspltw 6, 17, 2 - vspltw 7, 17, 3 - vspltw 8, 18, 0 - vspltw 9, 18, 1 - vspltw 10, 18, 2 - vspltw 11, 18, 3 - vspltw 12, 19, 0 - vadduwm 12, 12, 30 # increase counter - vspltw 13, 19, 1 - vspltw 14, 19, 2 - vspltw 15, 19, 3 - -.align 5 -quarter_loop: - QT_loop_4x - - bdnz quarter_loop - - vadduwm 12, 12, 30 - TP_4x 0, 1, 2, 3 - TP_4x 4, 5, 6, 7 - TP_4x 8, 9, 10, 11 - TP_4x 12, 13, 14, 15 - - Add_state 0 - Write_256 0 - addi 14, 14, 256 # offset += 256 - addi 15, 15, -256 # len += 256 - - # Update state counter - vspltisw 25, 4 - vadduwm 30, 30, 25 - - cmpdi 15, 0 - beq Out_loop - cmpdi 15, 256 - blt Out_loop - - mtctr 8 - b Loop_4x - -Out_loop: - RESTORE_REGS - blr - -Out_no_chacha: - li 3, 0 - blr -SYM_FUNC_END(chacha_p10le_8x) - -SYM_DATA_START_LOCAL(PERMX) -.align 5 -permx: -.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd -.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc -SYM_DATA_END(PERMX) diff --git a/arch/powerpc/lib/crypto/poly1305-p10-glue.c b/arch/powerpc/lib/crypto/poly1305-p10-glue.c deleted file mode 100644 index 3f1664a724b6..000000000000 --- a/arch/powerpc/lib/crypto/poly1305-p10-glue.c +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Poly1305 authenticator algorithm, RFC7539. - * - * Copyright 2023- IBM Corp. All rights reserved. 
- */ -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen); -asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit); -asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); - -static void vsx_begin(void) -{ - preempt_disable(); - enable_kernel_vsx(); -} - -static void vsx_end(void) -{ - disable_kernel_vsx(); - preempt_enable(); -} - -void poly1305_block_init_arch(struct poly1305_block_state *dctx, - const u8 raw_key[POLY1305_BLOCK_SIZE]) -{ - if (!static_key_enabled(&have_p10)) - return poly1305_block_init_generic(dctx, raw_key); - - dctx->h = (struct poly1305_state){}; - dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0); - dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8); -} -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) -{ - if (!static_key_enabled(&have_p10)) - return poly1305_blocks_generic(state, src, len, padbit); - vsx_begin(); - if (len >= POLY1305_BLOCK_SIZE * 4) { - poly1305_p10le_4blocks(state, src, len); - src += len - (len % (POLY1305_BLOCK_SIZE * 4)); - len %= POLY1305_BLOCK_SIZE * 4; - } - while (len >= POLY1305_BLOCK_SIZE) { - poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit); - len -= POLY1305_BLOCK_SIZE; - src += POLY1305_BLOCK_SIZE; - } - vsx_end(); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]) -{ - if (!static_key_enabled(&have_p10)) - return poly1305_emit_generic(state, digest, nonce); - poly1305_emit_64(state, nonce, digest); -} -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -bool poly1305_is_arch_optimized(void) -{ - return static_key_enabled(&have_p10); -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -static int __init poly1305_p10_init(void) -{ - if (cpu_has_feature(CPU_FTR_ARCH_31)) - static_branch_enable(&have_p10); - return 0; -} -subsys_initcall(poly1305_p10_init); - -static void __exit poly1305_p10_exit(void) -{ -} -module_exit(poly1305_p10_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Danny Tsen "); -MODULE_DESCRIPTION("Optimized Poly1305 for P10"); diff --git a/arch/powerpc/lib/crypto/poly1305-p10le_64.S b/arch/powerpc/lib/crypto/poly1305-p10le_64.S deleted file mode 100644 index a3c1987f1ecd..000000000000 --- a/arch/powerpc/lib/crypto/poly1305-p10le_64.S +++ /dev/null @@ -1,1075 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -# -# Accelerated poly1305 implementation for ppc64le. -# -# Copyright 2023- IBM Corp. All rights reserved -# -#=================================================================================== -# Written by Danny Tsen -# -# Poly1305 - this version mainly using vector/VSX/Scalar -# - 26 bits limbs -# - Handle multiple 64 byte blcok. -# -# Block size 16 bytes -# key = (r, s) -# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF -# p = 2^130 - 5 -# a += m -# a = (r + a) % p -# a += s -# -# Improve performance by breaking down polynominal to the sum of products with -# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r -# -# 07/22/21 - this revison based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0 -# to 9 vectors for multiplications. 
-# -# setup r^4, r^3, r^2, r vectors -# vs [r^1, r^3, r^2, r^4] -# vs0 = [r0,.....] -# vs1 = [r1,.....] -# vs2 = [r2,.....] -# vs3 = [r3,.....] -# vs4 = [r4,.....] -# vs5 = [r1*5,...] -# vs6 = [r2*5,...] -# vs7 = [r2*5,...] -# vs8 = [r4*5,...] -# -# Each word in a vector consists a member of a "r/s" in [a * r/s]. -# -# r0, r4*5, r3*5, r2*5, r1*5; -# r1, r0, r4*5, r3*5, r2*5; -# r2, r1, r0, r4*5, r3*5; -# r3, r2, r1, r0, r4*5; -# r4, r3, r2, r1, r0 ; -# -# -# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) -# k = 32 bytes key -# r3 = k (r, s) -# r4 = mlen -# r5 = m -# -#include -#include -#include -#include - -.machine "any" - -.text - -.macro SAVE_GPR GPR OFFSET FRAME - std \GPR,\OFFSET(\FRAME) -.endm - -.macro SAVE_VRS VRS OFFSET FRAME - li 16, \OFFSET - stvx \VRS, 16, \FRAME -.endm - -.macro SAVE_VSX VSX OFFSET FRAME - li 16, \OFFSET - stxvx \VSX, 16, \FRAME -.endm - -.macro RESTORE_GPR GPR OFFSET FRAME - ld \GPR,\OFFSET(\FRAME) -.endm - -.macro RESTORE_VRS VRS OFFSET FRAME - li 16, \OFFSET - lvx \VRS, 16, \FRAME -.endm - -.macro RESTORE_VSX VSX OFFSET FRAME - li 16, \OFFSET - lxvx \VSX, 16, \FRAME -.endm - -.macro SAVE_REGS - mflr 0 - std 0, 16(1) - stdu 1,-752(1) - - SAVE_GPR 14, 112, 1 - SAVE_GPR 15, 120, 1 - SAVE_GPR 16, 128, 1 - SAVE_GPR 17, 136, 1 - SAVE_GPR 18, 144, 1 - SAVE_GPR 19, 152, 1 - SAVE_GPR 20, 160, 1 - SAVE_GPR 21, 168, 1 - SAVE_GPR 22, 176, 1 - SAVE_GPR 23, 184, 1 - SAVE_GPR 24, 192, 1 - SAVE_GPR 25, 200, 1 - SAVE_GPR 26, 208, 1 - SAVE_GPR 27, 216, 1 - SAVE_GPR 28, 224, 1 - SAVE_GPR 29, 232, 1 - SAVE_GPR 30, 240, 1 - SAVE_GPR 31, 248, 1 - - addi 9, 1, 256 - SAVE_VRS 20, 0, 9 - SAVE_VRS 21, 16, 9 - SAVE_VRS 22, 32, 9 - SAVE_VRS 23, 48, 9 - SAVE_VRS 24, 64, 9 - SAVE_VRS 25, 80, 9 - SAVE_VRS 26, 96, 9 - SAVE_VRS 27, 112, 9 - SAVE_VRS 28, 128, 9 - SAVE_VRS 29, 144, 9 - SAVE_VRS 30, 160, 9 - SAVE_VRS 31, 176, 9 - - SAVE_VSX 14, 192, 9 - SAVE_VSX 15, 208, 9 - SAVE_VSX 16, 224, 9 - SAVE_VSX 17, 240, 9 - SAVE_VSX 18, 256, 9 - SAVE_VSX 19, 272, 9 - SAVE_VSX 20, 288, 9 - SAVE_VSX 21, 304, 9 - SAVE_VSX 22, 320, 9 - SAVE_VSX 23, 336, 9 - SAVE_VSX 24, 352, 9 - SAVE_VSX 25, 368, 9 - SAVE_VSX 26, 384, 9 - SAVE_VSX 27, 400, 9 - SAVE_VSX 28, 416, 9 - SAVE_VSX 29, 432, 9 - SAVE_VSX 30, 448, 9 - SAVE_VSX 31, 464, 9 -.endm # SAVE_REGS - -.macro RESTORE_REGS - addi 9, 1, 256 - RESTORE_VRS 20, 0, 9 - RESTORE_VRS 21, 16, 9 - RESTORE_VRS 22, 32, 9 - RESTORE_VRS 23, 48, 9 - RESTORE_VRS 24, 64, 9 - RESTORE_VRS 25, 80, 9 - RESTORE_VRS 26, 96, 9 - RESTORE_VRS 27, 112, 9 - RESTORE_VRS 28, 128, 9 - RESTORE_VRS 29, 144, 9 - RESTORE_VRS 30, 160, 9 - RESTORE_VRS 31, 176, 9 - - RESTORE_VSX 14, 192, 9 - RESTORE_VSX 15, 208, 9 - RESTORE_VSX 16, 224, 9 - RESTORE_VSX 17, 240, 9 - RESTORE_VSX 18, 256, 9 - RESTORE_VSX 19, 272, 9 - RESTORE_VSX 20, 288, 9 - RESTORE_VSX 21, 304, 9 - RESTORE_VSX 22, 320, 9 - RESTORE_VSX 23, 336, 9 - RESTORE_VSX 24, 352, 9 - RESTORE_VSX 25, 368, 9 - RESTORE_VSX 26, 384, 9 - RESTORE_VSX 27, 400, 9 - RESTORE_VSX 28, 416, 9 - RESTORE_VSX 29, 432, 9 - RESTORE_VSX 30, 448, 9 - RESTORE_VSX 31, 464, 9 - - RESTORE_GPR 14, 112, 1 - RESTORE_GPR 15, 120, 1 - RESTORE_GPR 16, 128, 1 - RESTORE_GPR 17, 136, 1 - RESTORE_GPR 18, 144, 1 - RESTORE_GPR 19, 152, 1 - RESTORE_GPR 20, 160, 1 - RESTORE_GPR 21, 168, 1 - RESTORE_GPR 22, 176, 1 - RESTORE_GPR 23, 184, 1 - RESTORE_GPR 24, 192, 1 - RESTORE_GPR 25, 200, 1 - RESTORE_GPR 26, 208, 1 - RESTORE_GPR 27, 216, 1 - RESTORE_GPR 28, 224, 1 - RESTORE_GPR 29, 232, 1 - RESTORE_GPR 30, 240, 1 - RESTORE_GPR 31, 248, 1 - - addi 1, 1, 752 - ld 0, 
16(1) - mtlr 0 -.endm # RESTORE_REGS - -# -# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5; -# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5; -# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5; -# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5; -# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ; -# -# [r^2, r^3, r^1, r^4] -# [m3, m2, m4, m1] -# -# multiply odd and even words -.macro mul_odd - vmulouw 14, 4, 26 - vmulouw 10, 5, 3 - vmulouw 11, 6, 2 - vmulouw 12, 7, 1 - vmulouw 13, 8, 0 - vmulouw 15, 4, 27 - vaddudm 14, 14, 10 - vaddudm 14, 14, 11 - vmulouw 10, 5, 26 - vmulouw 11, 6, 3 - vaddudm 14, 14, 12 - vaddudm 14, 14, 13 # x0 - vaddudm 15, 15, 10 - vaddudm 15, 15, 11 - vmulouw 12, 7, 2 - vmulouw 13, 8, 1 - vaddudm 15, 15, 12 - vaddudm 15, 15, 13 # x1 - vmulouw 16, 4, 28 - vmulouw 10, 5, 27 - vmulouw 11, 6, 26 - vaddudm 16, 16, 10 - vaddudm 16, 16, 11 - vmulouw 12, 7, 3 - vmulouw 13, 8, 2 - vaddudm 16, 16, 12 - vaddudm 16, 16, 13 # x2 - vmulouw 17, 4, 29 - vmulouw 10, 5, 28 - vmulouw 11, 6, 27 - vaddudm 17, 17, 10 - vaddudm 17, 17, 11 - vmulouw 12, 7, 26 - vmulouw 13, 8, 3 - vaddudm 17, 17, 12 - vaddudm 17, 17, 13 # x3 - vmulouw 18, 4, 30 - vmulouw 10, 5, 29 - vmulouw 11, 6, 28 - vaddudm 18, 18, 10 - vaddudm 18, 18, 11 - vmulouw 12, 7, 27 - vmulouw 13, 8, 26 - vaddudm 18, 18, 12 - vaddudm 18, 18, 13 # x4 -.endm - -.macro mul_even - vmuleuw 9, 4, 26 - vmuleuw 10, 5, 3 - vmuleuw 11, 6, 2 - vmuleuw 12, 7, 1 - vmuleuw 13, 8, 0 - vaddudm 14, 14, 9 - vaddudm 14, 14, 10 - vaddudm 14, 14, 11 - vaddudm 14, 14, 12 - vaddudm 14, 14, 13 # x0 - - vmuleuw 9, 4, 27 - vmuleuw 10, 5, 26 - vmuleuw 11, 6, 3 - vmuleuw 12, 7, 2 - vmuleuw 13, 8, 1 - vaddudm 15, 15, 9 - vaddudm 15, 15, 10 - vaddudm 15, 15, 11 - vaddudm 15, 15, 12 - vaddudm 15, 15, 13 # x1 - - vmuleuw 9, 4, 28 - vmuleuw 10, 5, 27 - vmuleuw 11, 6, 26 - vmuleuw 12, 7, 3 - vmuleuw 13, 8, 2 - vaddudm 16, 16, 9 - vaddudm 16, 16, 10 - vaddudm 16, 16, 11 - vaddudm 16, 16, 12 - vaddudm 16, 16, 13 # x2 - - vmuleuw 9, 4, 29 - vmuleuw 10, 5, 28 - vmuleuw 11, 6, 27 - vmuleuw 12, 7, 26 - vmuleuw 13, 8, 3 - vaddudm 17, 17, 9 - vaddudm 17, 17, 10 - vaddudm 17, 17, 11 - vaddudm 17, 17, 12 - vaddudm 17, 17, 13 # x3 - - vmuleuw 9, 4, 30 - vmuleuw 10, 5, 29 - vmuleuw 11, 6, 28 - vmuleuw 12, 7, 27 - vmuleuw 13, 8, 26 - vaddudm 18, 18, 9 - vaddudm 18, 18, 10 - vaddudm 18, 18, 11 - vaddudm 18, 18, 12 - vaddudm 18, 18, 13 # x4 -.endm - -# -# poly1305_setup_r -# -# setup r^4, r^3, r^2, r vectors -# [r, r^3, r^2, r^4] -# vs0 = [r0,...] -# vs1 = [r1,...] -# vs2 = [r2,...] -# vs3 = [r3,...] -# vs4 = [r4,...] -# vs5 = [r4*5,...] -# vs6 = [r3*5,...] -# vs7 = [r2*5,...] -# vs8 = [r1*5,...] 
-# -# r0, r4*5, r3*5, r2*5, r1*5; -# r1, r0, r4*5, r3*5, r2*5; -# r2, r1, r0, r4*5, r3*5; -# r3, r2, r1, r0, r4*5; -# r4, r3, r2, r1, r0 ; -# -.macro poly1305_setup_r - - # save r - xxlor 26, 58, 58 - xxlor 27, 59, 59 - xxlor 28, 60, 60 - xxlor 29, 61, 61 - xxlor 30, 62, 62 - - xxlxor 31, 31, 31 - -# [r, r^3, r^2, r^4] - # compute r^2 - vmr 4, 26 - vmr 5, 27 - vmr 6, 28 - vmr 7, 29 - vmr 8, 30 - bl do_mul # r^2 r^1 - xxpermdi 58, 58, 36, 0x3 # r0 - xxpermdi 59, 59, 37, 0x3 # r1 - xxpermdi 60, 60, 38, 0x3 # r2 - xxpermdi 61, 61, 39, 0x3 # r3 - xxpermdi 62, 62, 40, 0x3 # r4 - xxpermdi 36, 36, 36, 0x3 - xxpermdi 37, 37, 37, 0x3 - xxpermdi 38, 38, 38, 0x3 - xxpermdi 39, 39, 39, 0x3 - xxpermdi 40, 40, 40, 0x3 - vspltisb 13, 2 - vsld 9, 27, 13 - vsld 10, 28, 13 - vsld 11, 29, 13 - vsld 12, 30, 13 - vaddudm 0, 9, 27 - vaddudm 1, 10, 28 - vaddudm 2, 11, 29 - vaddudm 3, 12, 30 - - bl do_mul # r^4 r^3 - vmrgow 26, 26, 4 - vmrgow 27, 27, 5 - vmrgow 28, 28, 6 - vmrgow 29, 29, 7 - vmrgow 30, 30, 8 - vspltisb 13, 2 - vsld 9, 27, 13 - vsld 10, 28, 13 - vsld 11, 29, 13 - vsld 12, 30, 13 - vaddudm 0, 9, 27 - vaddudm 1, 10, 28 - vaddudm 2, 11, 29 - vaddudm 3, 12, 30 - - # r^2 r^4 - xxlor 0, 58, 58 - xxlor 1, 59, 59 - xxlor 2, 60, 60 - xxlor 3, 61, 61 - xxlor 4, 62, 62 - xxlor 5, 32, 32 - xxlor 6, 33, 33 - xxlor 7, 34, 34 - xxlor 8, 35, 35 - - vspltw 9, 26, 3 - vspltw 10, 26, 2 - vmrgow 26, 10, 9 - vspltw 9, 27, 3 - vspltw 10, 27, 2 - vmrgow 27, 10, 9 - vspltw 9, 28, 3 - vspltw 10, 28, 2 - vmrgow 28, 10, 9 - vspltw 9, 29, 3 - vspltw 10, 29, 2 - vmrgow 29, 10, 9 - vspltw 9, 30, 3 - vspltw 10, 30, 2 - vmrgow 30, 10, 9 - - vsld 9, 27, 13 - vsld 10, 28, 13 - vsld 11, 29, 13 - vsld 12, 30, 13 - vaddudm 0, 9, 27 - vaddudm 1, 10, 28 - vaddudm 2, 11, 29 - vaddudm 3, 12, 30 -.endm - -SYM_FUNC_START_LOCAL(do_mul) - mul_odd - - # do reduction ( h %= p ) - # carry reduction - vspltisb 9, 2 - vsrd 10, 14, 31 - vsrd 11, 17, 31 - vand 7, 17, 25 - vand 4, 14, 25 - vaddudm 18, 18, 11 - vsrd 12, 18, 31 - vaddudm 15, 15, 10 - - vsrd 11, 15, 31 - vand 8, 18, 25 - vand 5, 15, 25 - vaddudm 4, 4, 12 - vsld 10, 12, 9 - vaddudm 6, 16, 11 - - vsrd 13, 6, 31 - vand 6, 6, 25 - vaddudm 4, 4, 10 - vsrd 10, 4, 31 - vaddudm 7, 7, 13 - - vsrd 11, 7, 31 - vand 7, 7, 25 - vand 4, 4, 25 - vaddudm 5, 5, 10 - vaddudm 8, 8, 11 - blr -SYM_FUNC_END(do_mul) - -# -# init key -# -.macro do_poly1305_init - addis 10, 2, rmask@toc@ha - addi 10, 10, rmask@toc@l - - ld 11, 0(10) - ld 12, 8(10) - - li 14, 16 - li 15, 32 - addis 10, 2, cnum@toc@ha - addi 10, 10, cnum@toc@l - lvx 25, 0, 10 # v25 - mask - lvx 31, 14, 10 # v31 = 1a - lvx 19, 15, 10 # v19 = 1 << 24 - lxv 24, 48(10) # vs24 - lxv 25, 64(10) # vs25 - - # initialize - # load key from r3 to vectors - ld 9, 24(3) - ld 10, 32(3) - and. 9, 9, 11 - and. 
10, 10, 12 - - # break 26 bits - extrdi 14, 9, 26, 38 - extrdi 15, 9, 26, 12 - extrdi 16, 9, 12, 0 - mtvsrdd 58, 0, 14 - insrdi 16, 10, 14, 38 - mtvsrdd 59, 0, 15 - extrdi 17, 10, 26, 24 - mtvsrdd 60, 0, 16 - extrdi 18, 10, 24, 0 - mtvsrdd 61, 0, 17 - mtvsrdd 62, 0, 18 - - # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5 - li 9, 5 - mtvsrdd 36, 0, 9 - vmulouw 0, 27, 4 # v0 = rr0 - vmulouw 1, 28, 4 # v1 = rr1 - vmulouw 2, 29, 4 # v2 = rr2 - vmulouw 3, 30, 4 # v3 = rr3 -.endm - -# -# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) -# k = 32 bytes key -# r3 = k (r, s) -# r4 = mlen -# r5 = m -# -SYM_FUNC_START(poly1305_p10le_4blocks) -.align 5 - cmpdi 5, 64 - blt Out_no_poly1305 - - SAVE_REGS - - do_poly1305_init - - li 21, 0 # counter to message - - poly1305_setup_r - - # load previous H state - # break/convert r6 to 26 bits - ld 9, 0(3) - ld 10, 8(3) - ld 19, 16(3) - sldi 19, 19, 24 - mtvsrdd 41, 0, 19 - extrdi 14, 9, 26, 38 - extrdi 15, 9, 26, 12 - extrdi 16, 9, 12, 0 - mtvsrdd 36, 0, 14 - insrdi 16, 10, 14, 38 - mtvsrdd 37, 0, 15 - extrdi 17, 10, 26, 24 - mtvsrdd 38, 0, 16 - extrdi 18, 10, 24, 0 - mtvsrdd 39, 0, 17 - mtvsrdd 40, 0, 18 - vor 8, 8, 9 - - # input m1 m2 - add 20, 4, 21 - xxlor 49, 24, 24 - xxlor 50, 25, 25 - lxvw4x 43, 0, 20 - addi 17, 20, 16 - lxvw4x 44, 0, 17 - vperm 14, 11, 12, 17 - vperm 15, 11, 12, 18 - vand 9, 14, 25 # a0 - vsrd 10, 14, 31 # >> 26 - vsrd 11, 10, 31 # 12 bits left - vand 10, 10, 25 # a1 - vspltisb 13, 12 - vand 16, 15, 25 - vsld 12, 16, 13 - vor 11, 11, 12 - vand 11, 11, 25 # a2 - vspltisb 13, 14 - vsrd 12, 15, 13 # >> 14 - vsrd 13, 12, 31 # >> 26, a4 - vand 12, 12, 25 # a3 - - vaddudm 20, 4, 9 - vaddudm 21, 5, 10 - vaddudm 22, 6, 11 - vaddudm 23, 7, 12 - vaddudm 24, 8, 13 - - # m3 m4 - addi 17, 17, 16 - lxvw4x 43, 0, 17 - addi 17, 17, 16 - lxvw4x 44, 0, 17 - vperm 14, 11, 12, 17 - vperm 15, 11, 12, 18 - vand 9, 14, 25 # a0 - vsrd 10, 14, 31 # >> 26 - vsrd 11, 10, 31 # 12 bits left - vand 10, 10, 25 # a1 - vspltisb 13, 12 - vand 16, 15, 25 - vsld 12, 16, 13 - vspltisb 13, 14 - vor 11, 11, 12 - vand 11, 11, 25 # a2 - vsrd 12, 15, 13 # >> 14 - vsrd 13, 12, 31 # >> 26, a4 - vand 12, 12, 25 # a3 - - # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] - vmrgow 4, 9, 20 - vmrgow 5, 10, 21 - vmrgow 6, 11, 22 - vmrgow 7, 12, 23 - vmrgow 8, 13, 24 - vaddudm 8, 8, 19 - - addi 5, 5, -64 # len -= 64 - addi 21, 21, 64 # offset += 64 - - li 9, 64 - divdu 31, 5, 9 - - cmpdi 31, 0 - ble Skip_block_loop - - mtctr 31 - -# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r -# Rewrite the polynominal sum of product as follows, -# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2 -# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2 -# .... 
Repeat -# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 --> -# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r -# -loop_4blocks: - - # Multiply odd words and even words - mul_odd - mul_even - # carry reduction - vspltisb 9, 2 - vsrd 10, 14, 31 - vsrd 11, 17, 31 - vand 7, 17, 25 - vand 4, 14, 25 - vaddudm 18, 18, 11 - vsrd 12, 18, 31 - vaddudm 15, 15, 10 - - vsrd 11, 15, 31 - vand 8, 18, 25 - vand 5, 15, 25 - vaddudm 4, 4, 12 - vsld 10, 12, 9 - vaddudm 6, 16, 11 - - vsrd 13, 6, 31 - vand 6, 6, 25 - vaddudm 4, 4, 10 - vsrd 10, 4, 31 - vaddudm 7, 7, 13 - - vsrd 11, 7, 31 - vand 7, 7, 25 - vand 4, 4, 25 - vaddudm 5, 5, 10 - vaddudm 8, 8, 11 - - # input m1 m2 m3 m4 - add 20, 4, 21 - xxlor 49, 24, 24 - xxlor 50, 25, 25 - lxvw4x 43, 0, 20 - addi 17, 20, 16 - lxvw4x 44, 0, 17 - vperm 14, 11, 12, 17 - vperm 15, 11, 12, 18 - addi 17, 17, 16 - lxvw4x 43, 0, 17 - addi 17, 17, 16 - lxvw4x 44, 0, 17 - vperm 17, 11, 12, 17 - vperm 18, 11, 12, 18 - - vand 20, 14, 25 # a0 - vand 9, 17, 25 # a0 - vsrd 21, 14, 31 # >> 26 - vsrd 22, 21, 31 # 12 bits left - vsrd 10, 17, 31 # >> 26 - vsrd 11, 10, 31 # 12 bits left - - vand 21, 21, 25 # a1 - vand 10, 10, 25 # a1 - - vspltisb 13, 12 - vand 16, 15, 25 - vsld 23, 16, 13 - vor 22, 22, 23 - vand 22, 22, 25 # a2 - vand 16, 18, 25 - vsld 12, 16, 13 - vor 11, 11, 12 - vand 11, 11, 25 # a2 - vspltisb 13, 14 - vsrd 23, 15, 13 # >> 14 - vsrd 24, 23, 31 # >> 26, a4 - vand 23, 23, 25 # a3 - vsrd 12, 18, 13 # >> 14 - vsrd 13, 12, 31 # >> 26, a4 - vand 12, 12, 25 # a3 - - vaddudm 4, 4, 20 - vaddudm 5, 5, 21 - vaddudm 6, 6, 22 - vaddudm 7, 7, 23 - vaddudm 8, 8, 24 - - # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] - vmrgow 4, 9, 4 - vmrgow 5, 10, 5 - vmrgow 6, 11, 6 - vmrgow 7, 12, 7 - vmrgow 8, 13, 8 - vaddudm 8, 8, 19 - - addi 5, 5, -64 # len -= 64 - addi 21, 21, 64 # offset += 64 - - bdnz loop_4blocks - -Skip_block_loop: - xxlor 58, 0, 0 - xxlor 59, 1, 1 - xxlor 60, 2, 2 - xxlor 61, 3, 3 - xxlor 62, 4, 4 - xxlor 32, 5, 5 - xxlor 33, 6, 6 - xxlor 34, 7, 7 - xxlor 35, 8, 8 - - # Multiply odd words and even words - mul_odd - mul_even - - # Sum the products. 
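The carry reduction that follows the summation below can be sketched in plain C. This is a minimal illustrative sketch, not code from this patch: the helper name and the h[5] layout are assumptions, with h[0..4] standing for the five 26-bit accumulator limbs held in v4-v8. The carry out of the top limb is folded back into limb 0 with a factor of 5 because 2^130 is congruent to 5 modulo p = 2^130 - 5.

#include <stdint.h>

/* Illustrative sketch: propagate carries across five 26-bit limbs,
 * folding the carry out of limb 4 back into limb 0 (2^130 == 5 mod p).
 */
static void poly1305_carry_26(uint64_t h[5])
{
	uint64_t c;

	c = h[0] >> 26;  h[0] &= 0x3ffffff;  h[1] += c;
	c = h[1] >> 26;  h[1] &= 0x3ffffff;  h[2] += c;
	c = h[2] >> 26;  h[2] &= 0x3ffffff;  h[3] += c;
	c = h[3] >> 26;  h[3] &= 0x3ffffff;  h[4] += c;
	c = h[4] >> 26;  h[4] &= 0x3ffffff;  h[0] += c * 5;
	c = h[0] >> 26;  h[0] &= 0x3ffffff;  h[1] += c;
}

The vector code performs the same carries but interleaves them across limbs to hide instruction latency.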
- xxpermdi 41, 31, 46, 0 - xxpermdi 42, 31, 47, 0 - vaddudm 4, 14, 9 - xxpermdi 36, 31, 36, 3 - vaddudm 5, 15, 10 - xxpermdi 37, 31, 37, 3 - xxpermdi 43, 31, 48, 0 - vaddudm 6, 16, 11 - xxpermdi 38, 31, 38, 3 - xxpermdi 44, 31, 49, 0 - vaddudm 7, 17, 12 - xxpermdi 39, 31, 39, 3 - xxpermdi 45, 31, 50, 0 - vaddudm 8, 18, 13 - xxpermdi 40, 31, 40, 3 - - # carry reduction - vspltisb 9, 2 - vsrd 10, 4, 31 - vsrd 11, 7, 31 - vand 7, 7, 25 - vand 4, 4, 25 - vaddudm 8, 8, 11 - vsrd 12, 8, 31 - vaddudm 5, 5, 10 - - vsrd 11, 5, 31 - vand 8, 8, 25 - vand 5, 5, 25 - vaddudm 4, 4, 12 - vsld 10, 12, 9 - vaddudm 6, 6, 11 - - vsrd 13, 6, 31 - vand 6, 6, 25 - vaddudm 4, 4, 10 - vsrd 10, 4, 31 - vaddudm 7, 7, 13 - - vsrd 11, 7, 31 - vand 7, 7, 25 - vand 4, 4, 25 - vaddudm 5, 5, 10 - vsrd 10, 5, 31 - vand 5, 5, 25 - vaddudm 6, 6, 10 - vaddudm 8, 8, 11 - - b do_final_update - -do_final_update: - # combine 26 bit limbs - # v4, v5, v6, v7 and v8 are 26 bit vectors - vsld 5, 5, 31 - vor 20, 4, 5 - vspltisb 11, 12 - vsrd 12, 6, 11 - vsld 6, 6, 31 - vsld 6, 6, 31 - vor 20, 20, 6 - vspltisb 11, 14 - vsld 7, 7, 11 - vor 21, 7, 12 - mfvsrld 16, 40 # save last 2 bytes - vsld 8, 8, 11 - vsld 8, 8, 31 - vor 21, 21, 8 - mfvsrld 17, 52 - mfvsrld 19, 53 - srdi 16, 16, 24 - - std 17, 0(3) - std 19, 8(3) - stw 16, 16(3) - -Out_loop: - li 3, 0 - - RESTORE_REGS - - blr - -Out_no_poly1305: - li 3, 0 - blr -SYM_FUNC_END(poly1305_p10le_4blocks) - -# -# ======================================================================= -# The following functions implement 64 x 64 bits multiplication poly1305. -# -SYM_FUNC_START_LOCAL(Poly1305_init_64) - # mask 0x0FFFFFFC0FFFFFFC - # mask 0x0FFFFFFC0FFFFFFF - addis 10, 2, rmask@toc@ha - addi 10, 10, rmask@toc@l - ld 11, 0(10) - ld 12, 8(10) - - # initialize - # load key from r3 - ld 9, 24(3) - ld 10, 32(3) - and. 9, 9, 11 # cramp mask r0 - and. 10, 10, 12 # cramp mask r1 - - srdi 21, 10, 2 - add 19, 21, 10 # s1: r19 - (r1 >> 2) *5 - - # setup r and s - li 25, 0 - mtvsrdd 32+0, 9, 19 # r0, s1 - mtvsrdd 32+1, 10, 9 # r1, r0 - mtvsrdd 32+2, 19, 25 # s1 - mtvsrdd 32+3, 9, 25 # r0 - - blr -SYM_FUNC_END(Poly1305_init_64) - -# Poly1305_mult -# v6 = (h0, h1), v8 = h2 -# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0 -# -# Output: v7, v10, v11 -# -SYM_FUNC_START_LOCAL(Poly1305_mult) - # - # d0 = h0 * r0 + h1 * s1 - vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1 - - # d1 = h0 * r1 + h1 * r0 + h2 * s1 - vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0 - vmsumudm 10, 8, 2, 11 # d1 += h2 * s1 - - # d2 = r0 - vmsumudm 11, 8, 3, 9 # d2 = h2 * r0 - blr -SYM_FUNC_END(Poly1305_mult) - -# -# carry reduction -# h %=p -# -# Input: v7, v10, v11 -# Output: r27, r28, r29 -# -SYM_FUNC_START_LOCAL(Carry_reduction) - mfvsrld 27, 32+7 - mfvsrld 28, 32+10 - mfvsrld 29, 32+11 - mfvsrd 20, 32+7 # h0.h - mfvsrd 21, 32+10 # h1.h - - addc 28, 28, 20 - adde 29, 29, 21 - srdi 22, 29, 0x2 - sldi 23, 22, 0x2 - add 23, 23, 22 # (h2 & 3) * 5 - addc 27, 27, 23 # h0 - addze 28, 28 # h1 - andi. 
29, 29, 0x3 # h2 - blr -SYM_FUNC_END(Carry_reduction) - -# -# poly1305 multiplication -# h *= r, h %= p -# d0 = h0 * r0 + h1 * s1 -# d1 = h0 * r1 + h1 * r0 + h2 * s1 -# d2 = h0 * r0 -# -# -# unsigned int poly1305_test_64s(unisgned char *state, const byte *src, size_t len, highbit) -# - no highbit if final leftover block (highbit = 0) -# -SYM_FUNC_START(poly1305_64s) - cmpdi 5, 0 - ble Out_no_poly1305_64 - - mflr 0 - std 0, 16(1) - stdu 1,-400(1) - - SAVE_GPR 14, 112, 1 - SAVE_GPR 15, 120, 1 - SAVE_GPR 16, 128, 1 - SAVE_GPR 17, 136, 1 - SAVE_GPR 18, 144, 1 - SAVE_GPR 19, 152, 1 - SAVE_GPR 20, 160, 1 - SAVE_GPR 21, 168, 1 - SAVE_GPR 22, 176, 1 - SAVE_GPR 23, 184, 1 - SAVE_GPR 24, 192, 1 - SAVE_GPR 25, 200, 1 - SAVE_GPR 26, 208, 1 - SAVE_GPR 27, 216, 1 - SAVE_GPR 28, 224, 1 - SAVE_GPR 29, 232, 1 - SAVE_GPR 30, 240, 1 - SAVE_GPR 31, 248, 1 - - # Init poly1305 - bl Poly1305_init_64 - - li 25, 0 # offset to inp and outp - - add 11, 25, 4 - - # load h - # h0, h1, h2? - ld 27, 0(3) - ld 28, 8(3) - lwz 29, 16(3) - - li 30, 16 - divdu 31, 5, 30 - - mtctr 31 - - mr 24, 6 # highbit - -Loop_block_64: - vxor 9, 9, 9 - - ld 20, 0(11) - ld 21, 8(11) - addi 11, 11, 16 - - addc 27, 27, 20 - adde 28, 28, 21 - adde 29, 29, 24 - - li 22, 0 - mtvsrdd 32+6, 27, 28 # h0, h1 - mtvsrdd 32+8, 29, 22 # h2 - - bl Poly1305_mult - - bl Carry_reduction - - bdnz Loop_block_64 - - std 27, 0(3) - std 28, 8(3) - stw 29, 16(3) - - li 3, 0 - - RESTORE_GPR 14, 112, 1 - RESTORE_GPR 15, 120, 1 - RESTORE_GPR 16, 128, 1 - RESTORE_GPR 17, 136, 1 - RESTORE_GPR 18, 144, 1 - RESTORE_GPR 19, 152, 1 - RESTORE_GPR 20, 160, 1 - RESTORE_GPR 21, 168, 1 - RESTORE_GPR 22, 176, 1 - RESTORE_GPR 23, 184, 1 - RESTORE_GPR 24, 192, 1 - RESTORE_GPR 25, 200, 1 - RESTORE_GPR 26, 208, 1 - RESTORE_GPR 27, 216, 1 - RESTORE_GPR 28, 224, 1 - RESTORE_GPR 29, 232, 1 - RESTORE_GPR 30, 240, 1 - RESTORE_GPR 31, 248, 1 - - addi 1, 1, 400 - ld 0, 16(1) - mtlr 0 - - blr - -Out_no_poly1305_64: - li 3, 0 - blr -SYM_FUNC_END(poly1305_64s) - -# -# Input: r3 = h, r4 = s, r5 = mac -# mac = h + s -# -SYM_FUNC_START(poly1305_emit_64) - ld 10, 0(3) - ld 11, 8(3) - ld 12, 16(3) - - # compare modulus - # h + 5 + (-p) - mr 6, 10 - mr 7, 11 - mr 8, 12 - addic. 6, 6, 5 - addze 7, 7 - addze 8, 8 - srdi 9, 8, 2 # overflow? 
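The conditional final reduction computed around this point can be summarised in plain C. This is a rough sketch only, assuming the register convention noted above (r3 = h, r4 = s, r5 = mac) and a little-endian host; the function and argument names are illustrative. If h + 5 carries into bit 130 then h >= p, so the low 128 bits of h + 5 (that is, h - p) are kept; the second key half s is then added modulo 2^128.

#include <stdint.h>
#include <string.h>

/* Sketch: final reduction mod p = 2^130 - 5 followed by addition of s.
 * h[0..2] is the 130-bit accumulator (h[2] holds the top bits) and
 * s[0..1] is the second half of the key.
 */
static void poly1305_emit_sketch(const uint64_t h[3], const uint64_t s[2],
				 uint8_t mac[16])
{
	unsigned __int128 t;
	uint64_t h0 = h[0], h1 = h[1], g0, g1, g2;

	t  = (unsigned __int128)h0 + 5;
	g0 = (uint64_t)t;
	t  = (unsigned __int128)h1 + (uint64_t)(t >> 64);
	g1 = (uint64_t)t;
	g2 = h[2] + (uint64_t)(t >> 64);

	if (g2 >> 2) {		/* h + 5 reached 2^130, i.e. h >= p */
		h0 = g0;	/* keep h + 5 - 2^130 = h - p */
		h1 = g1;
	}

	t  = (unsigned __int128)h0 + s[0];
	h0 = (uint64_t)t;
	h1 = h1 + s[1] + (uint64_t)(t >> 64);	/* (h + s) mod 2^128 */

	memcpy(mac, &h0, 8);
	memcpy(mac + 8, &h1, 8);
}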
- cmpdi 9, 0 - beq Skip_h64 - mr 10, 6 - mr 11, 7 - mr 12, 8 - -Skip_h64: - ld 6, 0(4) - ld 7, 8(4) - addc 10, 10, 6 - adde 11, 11, 7 - addze 12, 12 - - std 10, 0(5) - std 11, 8(5) - blr -SYM_FUNC_END(poly1305_emit_64) - -SYM_DATA_START_LOCAL(RMASK) -.align 5 -rmask: -.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f -cnum: -.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000 -.long 0x1a, 0x00, 0x1a, 0x00 -.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 -.long 0x00010203, 0x04050607, 0x10111213, 0x14151617 -.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f -SYM_DATA_END(RMASK) diff --git a/arch/powerpc/lib/crypto/sha256-spe-asm.S b/arch/powerpc/lib/crypto/sha256-spe-asm.S deleted file mode 100644 index cd99d71dae34..000000000000 --- a/arch/powerpc/lib/crypto/sha256-spe-asm.S +++ /dev/null @@ -1,318 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Fast SHA-256 implementation for SPE instruction set (PPC) - * - * This code makes use of the SPE SIMD instruction set as defined in - * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf - * Implementation is based on optimization guide notes from - * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf - * - * Copyright (c) 2015 Markus Stockhausen - */ - -#include -#include - -#define rHP r3 /* pointer to hash values in memory */ -#define rKP r24 /* pointer to round constants */ -#define rWP r4 /* pointer to input data */ - -#define rH0 r5 /* 8 32 bit hash values in 8 registers */ -#define rH1 r6 -#define rH2 r7 -#define rH3 r8 -#define rH4 r9 -#define rH5 r10 -#define rH6 r11 -#define rH7 r12 - -#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */ -#define rW1 r15 -#define rW2 r16 -#define rW3 r17 -#define rW4 r18 -#define rW5 r19 -#define rW6 r20 -#define rW7 r21 - -#define rT0 r22 /* 64 bit temporaries */ -#define rT1 r23 -#define rT2 r0 /* 32 bit temporaries */ -#define rT3 r25 - -#define CMP_KN_LOOP -#define CMP_KC_LOOP \ - cmpwi rT1,0; - -#define INITIALIZE \ - stwu r1,-128(r1); /* create stack frame */ \ - evstdw r14,8(r1); /* We must save non volatile */ \ - evstdw r15,16(r1); /* registers. Take the chance */ \ - evstdw r16,24(r1); /* and save the SPE part too */ \ - evstdw r17,32(r1); \ - evstdw r18,40(r1); \ - evstdw r19,48(r1); \ - evstdw r20,56(r1); \ - evstdw r21,64(r1); \ - evstdw r22,72(r1); \ - evstdw r23,80(r1); \ - stw r24,88(r1); /* save normal registers */ \ - stw r25,92(r1); - - -#define FINALIZE \ - evldw r14,8(r1); /* restore SPE registers */ \ - evldw r15,16(r1); \ - evldw r16,24(r1); \ - evldw r17,32(r1); \ - evldw r18,40(r1); \ - evldw r19,48(r1); \ - evldw r20,56(r1); \ - evldw r21,64(r1); \ - evldw r22,72(r1); \ - evldw r23,80(r1); \ - lwz r24,88(r1); /* restore normal registers */ \ - lwz r25,92(r1); \ - xor r0,r0,r0; \ - stw r0,8(r1); /* Delete sensitive data */ \ - stw r0,16(r1); /* that we might have pushed */ \ - stw r0,24(r1); /* from other context that runs */ \ - stw r0,32(r1); /* the same code. 
Assume that */ \ - stw r0,40(r1); /* the lower part of the GPRs */ \ - stw r0,48(r1); /* was already overwritten on */ \ - stw r0,56(r1); /* the way down to here */ \ - stw r0,64(r1); \ - stw r0,72(r1); \ - stw r0,80(r1); \ - addi r1,r1,128; /* cleanup stack frame */ - -#ifdef __BIG_ENDIAN__ -#define LOAD_DATA(reg, off) \ - lwz reg,off(rWP); /* load data */ -#define NEXT_BLOCK \ - addi rWP,rWP,64; /* increment per block */ -#else -#define LOAD_DATA(reg, off) \ - lwbrx reg,0,rWP; /* load data */ \ - addi rWP,rWP,4; /* increment per word */ -#define NEXT_BLOCK /* nothing to do */ -#endif - -#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \ - LOAD_DATA(w, off) /* 1: W */ \ - rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \ - rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \ - rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \ - xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \ - and rT3,e,f; /* 1: ch = e and f */ \ - xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \ - andc rT1,g,e; /* 1: ch' = ~e and g */ \ - lwz rT2,off(rKP); /* 1: K */ \ - xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \ - add h,h,rT0; /* 1: temp1 = h + S1 */ \ - add rT3,rT3,w; /* 1: temp1' = ch + w */ \ - rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \ - add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \ - rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \ - add h,h,rT2; /* 1: temp1 = temp1 + K */ \ - rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \ - xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \ - add d,d,h; /* 1: d = d + temp1 */ \ - xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \ - evmergelo w,w,w; /* shift W */ \ - or rT2,a,b; /* 1: maj = a or b */ \ - and rT1,a,b; /* 1: maj' = a and b */ \ - and rT2,rT2,c; /* 1: maj = maj and c */ \ - LOAD_DATA(w, off+4) /* 2: W */ \ - or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \ - rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ - add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \ - rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ - add h,h,rT3; /* 1: h = temp1 + temp2 */ \ - rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \ - xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ - and rT3,d,e; /* 2: ch = e and f */ \ - xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ - andc rT1,f,d; /* 2: ch' = ~e and g */ \ - lwz rT2,off+4(rKP); /* 2: K */ \ - xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ - add g,g,rT0; /* 2: temp1 = h + S1 */ \ - add rT3,rT3,w; /* 2: temp1' = ch + w */ \ - rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ - add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \ - rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ - add g,g,rT2; /* 2: temp1 = temp1 + K */ \ - rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ - xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ - or rT2,h,a; /* 2: maj = a or b */ \ - xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ - and rT1,h,a; /* 2: maj' = a and b */ \ - and rT2,rT2,b; /* 2: maj = maj and c */ \ - add c,c,g; /* 2: d = d + temp1 */ \ - or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ - add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ - add g,g,rT3 /* 2: h = temp1 + temp2 */ - -#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \ - rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \ - evmergelohi rT0,w0,w1; /* w[-15] */ \ - rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \ - evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \ - xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ - evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \ - rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \ - evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \ - xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ - evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \ - add h,h,rT2; /* 1: temp1 = h + S1 */ \ - evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \ - and 
rT2,e,f; /* 1: ch = e and f */ \ - evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \ - andc rT3,g,e; /* 1: ch' = ~e and g */ \ - evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \ - xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \ - evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \ - add h,h,rT2; /* 1: temp1 = temp1 + ch */ \ - evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ - rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \ - evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \ - rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \ - evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ - xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ - evldw rT1,off(rKP); /* k */ \ - rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \ - evaddw w0,w0,rT0; /* w = w + s1 */ \ - xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ - evmergelohi rT0,w4,w5; /* w[-7] */ \ - and rT3,a,b; /* 1: maj = a and b */ \ - evaddw w0,w0,rT0; /* w = w + w[-7] */ \ - CMP_K##k##_LOOP \ - add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \ - evaddw rT1,rT1,w0; /* wk = w + k */ \ - xor rT3,a,b; /* 1: maj = a xor b */ \ - evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \ - and rT3,rT3,c; /* 1: maj = maj and c */ \ - add h,h,rT0; /* 1: temp1 = temp1 + wk */ \ - add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \ - add g,g,rT1; /* 2: temp1 = temp1 + wk */ \ - add d,d,h; /* 1: d = d + temp1 */ \ - rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ - add h,h,rT2; /* 1: h = temp1 + temp2 */ \ - rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ - rotrwi rT2,d,25; /* 2: S" = e rotr 25 */ \ - xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ - and rT3,d,e; /* 2: ch = e and f */ \ - xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ - andc rT1,f,d; /* 2: ch' = ~e and g */ \ - add g,g,rT0; /* 2: temp1 = h + S1 */ \ - xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ - rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ - add g,g,rT3; /* 2: temp1 = temp1 + ch */ \ - rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ - rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ - xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ - or rT2,h,a; /* 2: maj = a or b */ \ - and rT1,h,a; /* 2: maj' = a and b */ \ - and rT2,rT2,b; /* 2: maj = maj and c */ \ - xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ - or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ - add c,c,g; /* 2: d = d + temp1 */ \ - add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ - add g,g,rT3 /* 2: h = temp1 + temp2 */ - -_GLOBAL(ppc_spe_sha256_transform) - INITIALIZE - - mtctr r5 - lwz rH0,0(rHP) - lwz rH1,4(rHP) - lwz rH2,8(rHP) - lwz rH3,12(rHP) - lwz rH4,16(rHP) - lwz rH5,20(rHP) - lwz rH6,24(rHP) - lwz rH7,28(rHP) - -ppc_spe_sha256_main: - lis rKP,PPC_SPE_SHA256_K@ha - addi rKP,rKP,PPC_SPE_SHA256_K@l - - R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0) - R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8) - R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16) - R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24) - R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32) - R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40) - R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48) - R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56) -ppc_spe_sha256_16_rounds: - addi rKP,rKP,64 - R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, - rW0, rW1, rW4, rW5, rW7, N, 0) - R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, - rW1, rW2, rW5, rW6, rW0, N, 8) - R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, - rW2, rW3, rW6, rW7, rW1, N, 16) - R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, - rW3, rW4, rW7, rW0, rW2, N, 24) - R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, - rW4, rW5, rW0, rW1, rW3, N, 32) - R_CALC_W(rH6, rH7, rH0, rH1, rH2, 
rH3, rH4, rH5, - rW5, rW6, rW1, rW2, rW4, N, 40) - R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, - rW6, rW7, rW2, rW3, rW5, N, 48) - R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, - rW7, rW0, rW3, rW4, rW6, C, 56) - bt gt,ppc_spe_sha256_16_rounds - - lwz rW0,0(rHP) - NEXT_BLOCK - lwz rW1,4(rHP) - lwz rW2,8(rHP) - lwz rW3,12(rHP) - lwz rW4,16(rHP) - lwz rW5,20(rHP) - lwz rW6,24(rHP) - lwz rW7,28(rHP) - - add rH0,rH0,rW0 - stw rH0,0(rHP) - add rH1,rH1,rW1 - stw rH1,4(rHP) - add rH2,rH2,rW2 - stw rH2,8(rHP) - add rH3,rH3,rW3 - stw rH3,12(rHP) - add rH4,rH4,rW4 - stw rH4,16(rHP) - add rH5,rH5,rW5 - stw rH5,20(rHP) - add rH6,rH6,rW6 - stw rH6,24(rHP) - add rH7,rH7,rW7 - stw rH7,28(rHP) - - bdnz ppc_spe_sha256_main - - FINALIZE - blr - -.data -.align 5 -PPC_SPE_SHA256_K: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 diff --git a/arch/powerpc/lib/crypto/sha256.c b/arch/powerpc/lib/crypto/sha256.c deleted file mode 100644 index 6b0f079587eb..000000000000 --- a/arch/powerpc/lib/crypto/sha256.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 Secure Hash Algorithm, SPE optimized - * - * Based on generic implementation. The assembler module takes care - * about the SPE registers so it can run from interrupt context. - * - * Copyright (c) 2015 Markus Stockhausen - */ - -#include -#include -#include -#include -#include - -/* - * MAX_BYTES defines the number of bytes that are allowed to be processed - * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000 - * operations per 64 bytes. e500 cores can issue two arithmetic instructions - * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). - * Thus 1KB of input data will need an estimated maximum of 18,000 cycles. - * Headroom for cache misses included. Even with the low end model clocked - * at 667 MHz this equals to a critical time window of less than 27us. - * - */ -#define MAX_BYTES 1024 - -extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks); - -static void spe_begin(void) -{ - /* We just start SPE operations and will save SPE registers later. 
*/ - preempt_disable(); - enable_kernel_spe(); -} - -static void spe_end(void) -{ - disable_kernel_spe(); - /* reenable preemption */ - preempt_enable(); -} - -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - do { - /* cut input data into smaller blocks */ - u32 unit = min_t(size_t, nblocks, - MAX_BYTES / SHA256_BLOCK_SIZE); - - spe_begin(); - ppc_spe_sha256_transform(state, data, unit); - spe_end(); - - data += unit * SHA256_BLOCK_SIZE; - nblocks -= unit; - } while (nblocks); -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -bool sha256_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm, SPE optimized"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 43c44316fbbd..f4f9a70dd089 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -199,7 +199,7 @@ if MIPS source "lib/crypto/mips/Kconfig" endif if PPC -source "arch/powerpc/lib/crypto/Kconfig" +source "lib/crypto/powerpc/Kconfig" endif if RISCV source "arch/riscv/lib/crypto/Kconfig" diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index f54d2f3edc40..f5f1dcec2f89 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -110,3 +110,4 @@ libsm3-y := sm3.o obj-$(CONFIG_ARM) += arm/ obj-$(CONFIG_ARM64) += arm64/ obj-$(CONFIG_MIPS) += mips/ +obj-$(CONFIG_PPC) += powerpc/ diff --git a/lib/crypto/powerpc/Kconfig b/lib/crypto/powerpc/Kconfig new file mode 100644 index 000000000000..3f9e1bbd9905 --- /dev/null +++ b/lib/crypto/powerpc/Kconfig @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA20_P10 + tristate + depends on PPC64 && CPU_LITTLE_ENDIAN && VSX + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_P10 + tristate + depends on PPC64 && CPU_LITTLE_ENDIAN && VSX + depends on BROKEN # Needs to be fixed to work in softirq context + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 + select CRYPTO_LIB_POLY1305_GENERIC + +config CRYPTO_SHA256_PPC_SPE + tristate + depends on SPE + default CRYPTO_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256 diff --git a/lib/crypto/powerpc/Makefile b/lib/crypto/powerpc/Makefile new file mode 100644 index 000000000000..27f231f8e334 --- /dev/null +++ b/lib/crypto/powerpc/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o +chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o + +obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o +poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o + +obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o +sha256-ppc-spe-y := sha256.o sha256-spe-asm.o diff --git a/lib/crypto/powerpc/chacha-p10-glue.c b/lib/crypto/powerpc/chacha-p10-glue.c new file mode 100644 index 000000000000..fcd23c6f1590 --- /dev/null +++ b/lib/crypto/powerpc/chacha-p10-glue.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ChaCha stream cipher (P10 accelerated) + * + * Copyright 2023- IBM Corp. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + unsigned int l = bytes & ~0x0FF; + + if (l > 0) { + chacha_p10le_8x(state, dst, src, l, nrounds); + bytes -= l; + src += l; + dst += l; + state->x[12] += l / CHACHA_BLOCK_SIZE; + } + + if (bytes > 0) + chacha_crypt_generic(state, dst, src, bytes, nrounds); +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + hchacha_block_generic(state, out, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + vsx_begin(); + chacha_p10_do_8x(state, dst, src, todo, nrounds); + vsx_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&have_p10); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_p10_init(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + static_branch_enable(&have_p10); + return 0; +} +subsys_initcall(chacha_p10_init); + +static void __exit chacha_p10_exit(void) +{ +} +module_exit(chacha_p10_exit); + +MODULE_DESCRIPTION("ChaCha stream cipher (P10 accelerated)"); +MODULE_AUTHOR("Danny Tsen "); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/powerpc/chacha-p10le-8x.S b/lib/crypto/powerpc/chacha-p10le-8x.S new file mode 100644 index 000000000000..b29562bd5d40 --- /dev/null +++ b/lib/crypto/powerpc/chacha-p10le-8x.S @@ -0,0 +1,840 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated chacha20 implementation for ppc64le. +# +# Copyright 2023- IBM Corp. All rights reserved +# +#=================================================================================== +# Written by Danny Tsen +# +# do rounds, 8 quarter rounds +# 1. a += b; d ^= a; d <<<= 16; +# 2. c += d; b ^= c; b <<<= 12; +# 3. a += b; d ^= a; d <<<= 8; +# 4. c += d; b ^= c; b <<<= 7 +# +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12 +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7 +# +# 4 blocks (a b c d) +# +# a0 b0 c0 d0 +# a1 b1 c1 d1 +# ... +# a4 b4 c4 d4 +# ... +# a8 b8 c8 d8 +# ... +# a12 b12 c12 d12 +# a13 ... +# a14 ... 
+# a15 b15 c15 d15 +# +# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) +# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) +# + +#include +#include +#include +#include + +.machine "any" +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx \VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx \VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi 9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi 9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416, 9 + RESTORE_VSX 29, 432, 9 + RESTORE_VSX 30, 448, 9 + RESTORE_VSX 31, 464, 9 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 752 + ld 0, 16(1) + mtlr 0 +.endm # RESTORE_REGS + +.macro QT_loop_8x + # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) + xxlor 0, 32+25, 32+25 + xxlor 32+25, 20, 20 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vadduwm 16, 16, 20 + vadduwm 17, 17, 21 + vadduwm 18, 18, 22 + vadduwm 19, 19, 23 + + vpermxor 12, 12, 0, 25 + vpermxor 13, 13, 1, 25 
+ vpermxor 14, 14, 2, 25 + vpermxor 15, 15, 3, 25 + vpermxor 28, 28, 16, 25 + vpermxor 29, 29, 17, 25 + vpermxor 30, 30, 18, 25 + vpermxor 31, 31, 19, 25 + xxlor 32+25, 0, 0 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vadduwm 24, 24, 28 + vadduwm 25, 25, 29 + vadduwm 26, 26, 30 + vadduwm 27, 27, 31 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vxor 20, 20, 24 + vxor 21, 21, 25 + vxor 22, 22, 26 + vxor 23, 23, 27 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 21, 21 + vrlw 4, 4, 25 # + vrlw 5, 5, 25 + vrlw 6, 6, 25 + vrlw 7, 7, 25 + vrlw 20, 20, 25 # + vrlw 21, 21, 25 + vrlw 22, 22, 25 + vrlw 23, 23, 25 + xxlor 32+25, 0, 0 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vadduwm 16, 16, 20 + vadduwm 17, 17, 21 + vadduwm 18, 18, 22 + vadduwm 19, 19, 23 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 22, 22 + vpermxor 12, 12, 0, 25 + vpermxor 13, 13, 1, 25 + vpermxor 14, 14, 2, 25 + vpermxor 15, 15, 3, 25 + vpermxor 28, 28, 16, 25 + vpermxor 29, 29, 17, 25 + vpermxor 30, 30, 18, 25 + vpermxor 31, 31, 19, 25 + xxlor 32+25, 0, 0 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vadduwm 24, 24, 28 + vadduwm 25, 25, 29 + vadduwm 26, 26, 30 + vadduwm 27, 27, 31 + xxlor 0, 32+28, 32+28 + xxlor 32+28, 23, 23 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vxor 20, 20, 24 + vxor 21, 21, 25 + vxor 22, 22, 26 + vxor 23, 23, 27 + vrlw 4, 4, 28 # + vrlw 5, 5, 28 + vrlw 6, 6, 28 + vrlw 7, 7, 28 + vrlw 20, 20, 28 # + vrlw 21, 21, 28 + vrlw 22, 22, 28 + vrlw 23, 23, 28 + xxlor 32+28, 0, 0 + + # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) + xxlor 0, 32+25, 32+25 + xxlor 32+25, 20, 20 + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vadduwm 16, 16, 21 + vadduwm 17, 17, 22 + vadduwm 18, 18, 23 + vadduwm 19, 19, 20 + + vpermxor 15, 15, 0, 25 + vpermxor 12, 12, 1, 25 + vpermxor 13, 13, 2, 25 + vpermxor 14, 14, 3, 25 + vpermxor 31, 31, 16, 25 + vpermxor 28, 28, 17, 25 + vpermxor 29, 29, 18, 25 + vpermxor 30, 30, 19, 25 + + xxlor 32+25, 0, 0 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vadduwm 26, 26, 31 + vadduwm 27, 27, 28 + vadduwm 24, 24, 29 + vadduwm 25, 25, 30 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vxor 21, 21, 26 + vxor 22, 22, 27 + vxor 23, 23, 24 + vxor 20, 20, 25 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 21, 21 + vrlw 5, 5, 25 + vrlw 6, 6, 25 + vrlw 7, 7, 25 + vrlw 4, 4, 25 + vrlw 21, 21, 25 + vrlw 22, 22, 25 + vrlw 23, 23, 25 + vrlw 20, 20, 25 + xxlor 32+25, 0, 0 + + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vadduwm 16, 16, 21 + vadduwm 17, 17, 22 + vadduwm 18, 18, 23 + vadduwm 19, 19, 20 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 22, 22 + vpermxor 15, 15, 0, 25 + vpermxor 12, 12, 1, 25 + vpermxor 13, 13, 2, 25 + vpermxor 14, 14, 3, 25 + vpermxor 31, 31, 16, 25 + vpermxor 28, 28, 17, 25 + vpermxor 29, 29, 18, 25 + vpermxor 30, 30, 19, 25 + xxlor 32+25, 0, 0 + + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vadduwm 26, 26, 31 + vadduwm 27, 27, 28 + vadduwm 24, 24, 29 + vadduwm 25, 25, 30 + + xxlor 0, 32+28, 32+28 + xxlor 32+28, 23, 23 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vxor 21, 21, 26 + vxor 22, 22, 27 + vxor 23, 23, 24 + vxor 20, 20, 25 + vrlw 5, 5, 28 + vrlw 6, 6, 28 + vrlw 7, 7, 28 + vrlw 4, 4, 28 + vrlw 21, 21, 28 + vrlw 22, 22, 28 + vrlw 23, 23, 28 + vrlw 20, 20, 28 + xxlor 32+28, 0, 
0 +.endm + +.macro QT_loop_4x + # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vpermxor 12, 12, 0, 20 + vpermxor 13, 13, 1, 20 + vpermxor 14, 14, 2, 20 + vpermxor 15, 15, 3, 20 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vrlw 4, 4, 21 + vrlw 5, 5, 21 + vrlw 6, 6, 21 + vrlw 7, 7, 21 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vpermxor 12, 12, 0, 22 + vpermxor 13, 13, 1, 22 + vpermxor 14, 14, 2, 22 + vpermxor 15, 15, 3, 22 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vrlw 4, 4, 23 + vrlw 5, 5, 23 + vrlw 6, 6, 23 + vrlw 7, 7, 23 + + # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vpermxor 15, 15, 0, 20 + vpermxor 12, 12, 1, 20 + vpermxor 13, 13, 2, 20 + vpermxor 14, 14, 3, 20 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vrlw 5, 5, 21 + vrlw 6, 6, 21 + vrlw 7, 7, 21 + vrlw 4, 4, 21 + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vpermxor 15, 15, 0, 22 + vpermxor 12, 12, 1, 22 + vpermxor 13, 13, 2, 22 + vpermxor 14, 14, 3, 22 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vrlw 5, 5, 23 + vrlw 6, 6, 23 + vrlw 7, 7, 23 + vrlw 4, 4, 23 +.endm + +# Transpose +.macro TP_4x a0 a1 a2 a3 + xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1 + xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3 + xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1 + xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3 + xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3 + xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3 + xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3 + xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3 +.endm + +# key stream = working state + state +.macro Add_state S + vadduwm \S+0, \S+0, 16-\S + vadduwm \S+4, \S+4, 17-\S + vadduwm \S+8, \S+8, 18-\S + vadduwm \S+12, \S+12, 19-\S + + vadduwm \S+1, \S+1, 16-\S + vadduwm \S+5, \S+5, 17-\S + vadduwm \S+9, \S+9, 18-\S + vadduwm \S+13, \S+13, 19-\S + + vadduwm \S+2, \S+2, 16-\S + vadduwm \S+6, \S+6, 17-\S + vadduwm \S+10, \S+10, 18-\S + vadduwm \S+14, \S+14, 19-\S + + vadduwm \S+3, \S+3, 16-\S + vadduwm \S+7, \S+7, 17-\S + vadduwm \S+11, \S+11, 18-\S + vadduwm \S+15, \S+15, 19-\S +.endm + +# +# write 256 bytes +# +.macro Write_256 S + add 9, 14, 5 + add 16, 14, 4 + lxvw4x 0, 0, 9 + lxvw4x 1, 17, 9 + lxvw4x 2, 18, 9 + lxvw4x 3, 19, 9 + lxvw4x 4, 20, 9 + lxvw4x 5, 21, 9 + lxvw4x 6, 22, 9 + lxvw4x 7, 23, 9 + lxvw4x 8, 24, 9 + lxvw4x 9, 25, 9 + lxvw4x 10, 26, 9 + lxvw4x 11, 27, 9 + lxvw4x 12, 28, 9 + lxvw4x 13, 29, 9 + lxvw4x 14, 30, 9 + lxvw4x 15, 31, 9 + + xxlxor \S+32, \S+32, 0 + xxlxor \S+36, \S+36, 1 + xxlxor \S+40, \S+40, 2 + xxlxor \S+44, \S+44, 3 + xxlxor \S+33, \S+33, 4 + xxlxor \S+37, \S+37, 5 + xxlxor \S+41, \S+41, 6 + xxlxor \S+45, \S+45, 7 + xxlxor \S+34, \S+34, 8 + xxlxor \S+38, \S+38, 9 + xxlxor \S+42, \S+42, 10 + xxlxor \S+46, \S+46, 11 + xxlxor \S+35, \S+35, 12 + xxlxor \S+39, \S+39, 13 + xxlxor \S+43, \S+43, 14 + xxlxor \S+47, \S+47, 15 + + stxvw4x \S+32, 0, 16 + stxvw4x \S+36, 17, 16 + stxvw4x \S+40, 18, 16 + stxvw4x \S+44, 19, 16 + + stxvw4x \S+33, 20, 
16 + stxvw4x \S+37, 21, 16 + stxvw4x \S+41, 22, 16 + stxvw4x \S+45, 23, 16 + + stxvw4x \S+34, 24, 16 + stxvw4x \S+38, 25, 16 + stxvw4x \S+42, 26, 16 + stxvw4x \S+46, 27, 16 + + stxvw4x \S+35, 28, 16 + stxvw4x \S+39, 29, 16 + stxvw4x \S+43, 30, 16 + stxvw4x \S+47, 31, 16 + +.endm + +# +# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src, +# unsigned int len, int nrounds); +# +SYM_FUNC_START(chacha_p10le_8x) +.align 5 + cmpdi 6, 0 + ble Out_no_chacha + + SAVE_REGS + + # r17 - r31 mainly for Write_256 macro. + li 17, 16 + li 18, 32 + li 19, 48 + li 20, 64 + li 21, 80 + li 22, 96 + li 23, 112 + li 24, 128 + li 25, 144 + li 26, 160 + li 27, 176 + li 28, 192 + li 29, 208 + li 30, 224 + li 31, 240 + + mr 15, 6 # len + li 14, 0 # offset to inp and outp + + lxvw4x 48, 0, 3 # vr16, constants + lxvw4x 49, 17, 3 # vr17, key 1 + lxvw4x 50, 18, 3 # vr18, key 2 + lxvw4x 51, 19, 3 # vr19, counter, nonce + + # create (0, 1, 2, 3) counters + vspltisw 0, 0 + vspltisw 1, 1 + vspltisw 2, 2 + vspltisw 3, 3 + vmrghw 4, 0, 1 + vmrglw 5, 2, 3 + vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3) + + vspltisw 21, 12 + vspltisw 23, 7 + + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxvw4x 32+20, 0, 11 + lxvw4x 32+22, 17, 11 + + sradi 8, 7, 1 + + mtctr 8 + + # save constants to vsx + xxlor 16, 48, 48 + xxlor 17, 49, 49 + xxlor 18, 50, 50 + xxlor 19, 51, 51 + + vspltisw 25, 4 + vspltisw 26, 8 + + xxlor 25, 32+26, 32+26 + xxlor 24, 32+25, 32+25 + + vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4) + xxlor 30, 32+30, 32+30 + xxlor 31, 32+31, 32+31 + + xxlor 20, 32+20, 32+20 + xxlor 21, 32+21, 32+21 + xxlor 22, 32+22, 32+22 + xxlor 23, 32+23, 32+23 + + cmpdi 6, 512 + blt Loop_last + +Loop_8x: + xxspltw 32+0, 16, 0 + xxspltw 32+1, 16, 1 + xxspltw 32+2, 16, 2 + xxspltw 32+3, 16, 3 + + xxspltw 32+4, 17, 0 + xxspltw 32+5, 17, 1 + xxspltw 32+6, 17, 2 + xxspltw 32+7, 17, 3 + xxspltw 32+8, 18, 0 + xxspltw 32+9, 18, 1 + xxspltw 32+10, 18, 2 + xxspltw 32+11, 18, 3 + xxspltw 32+12, 19, 0 + xxspltw 32+13, 19, 1 + xxspltw 32+14, 19, 2 + xxspltw 32+15, 19, 3 + vadduwm 12, 12, 30 # increase counter + + xxspltw 32+16, 16, 0 + xxspltw 32+17, 16, 1 + xxspltw 32+18, 16, 2 + xxspltw 32+19, 16, 3 + + xxspltw 32+20, 17, 0 + xxspltw 32+21, 17, 1 + xxspltw 32+22, 17, 2 + xxspltw 32+23, 17, 3 + xxspltw 32+24, 18, 0 + xxspltw 32+25, 18, 1 + xxspltw 32+26, 18, 2 + xxspltw 32+27, 18, 3 + xxspltw 32+28, 19, 0 + xxspltw 32+29, 19, 1 + vadduwm 28, 28, 31 # increase counter + xxspltw 32+30, 19, 2 + xxspltw 32+31, 19, 3 + +.align 5 +quarter_loop_8x: + QT_loop_8x + + bdnz quarter_loop_8x + + xxlor 0, 32+30, 32+30 + xxlor 32+30, 30, 30 + vadduwm 12, 12, 30 + xxlor 32+30, 0, 0 + TP_4x 0, 1, 2, 3 + TP_4x 4, 5, 6, 7 + TP_4x 8, 9, 10, 11 + TP_4x 12, 13, 14, 15 + + xxlor 0, 48, 48 + xxlor 1, 49, 49 + xxlor 2, 50, 50 + xxlor 3, 51, 51 + xxlor 48, 16, 16 + xxlor 49, 17, 17 + xxlor 50, 18, 18 + xxlor 51, 19, 19 + Add_state 0 + xxlor 48, 0, 0 + xxlor 49, 1, 1 + xxlor 50, 2, 2 + xxlor 51, 3, 3 + Write_256 0 + addi 14, 14, 256 # offset +=256 + addi 15, 15, -256 # len -=256 + + xxlor 5, 32+31, 32+31 + xxlor 32+31, 31, 31 + vadduwm 28, 28, 31 + xxlor 32+31, 5, 5 + TP_4x 16+0, 16+1, 16+2, 16+3 + TP_4x 16+4, 16+5, 16+6, 16+7 + TP_4x 16+8, 16+9, 16+10, 16+11 + TP_4x 16+12, 16+13, 16+14, 16+15 + + xxlor 32, 16, 16 + xxlor 33, 17, 17 + xxlor 34, 18, 18 + xxlor 35, 19, 19 + Add_state 16 + Write_256 16 + addi 14, 14, 256 # offset +=256 + addi 15, 15, -256 # len +=256 + + xxlor 32+24, 24, 24 + xxlor 32+25, 25, 25 + xxlor 32+30, 
30, 30 + vadduwm 30, 30, 25 + vadduwm 31, 30, 24 + xxlor 30, 32+30, 32+30 + xxlor 31, 32+31, 32+31 + + cmpdi 15, 0 + beq Out_loop + + cmpdi 15, 512 + blt Loop_last + + mtctr 8 + b Loop_8x + +Loop_last: + lxvw4x 48, 0, 3 # vr16, constants + lxvw4x 49, 17, 3 # vr17, key 1 + lxvw4x 50, 18, 3 # vr18, key 2 + lxvw4x 51, 19, 3 # vr19, counter, nonce + + vspltisw 21, 12 + vspltisw 23, 7 + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxvw4x 32+20, 0, 11 + lxvw4x 32+22, 17, 11 + + sradi 8, 7, 1 + mtctr 8 + +Loop_4x: + vspltw 0, 16, 0 + vspltw 1, 16, 1 + vspltw 2, 16, 2 + vspltw 3, 16, 3 + + vspltw 4, 17, 0 + vspltw 5, 17, 1 + vspltw 6, 17, 2 + vspltw 7, 17, 3 + vspltw 8, 18, 0 + vspltw 9, 18, 1 + vspltw 10, 18, 2 + vspltw 11, 18, 3 + vspltw 12, 19, 0 + vadduwm 12, 12, 30 # increase counter + vspltw 13, 19, 1 + vspltw 14, 19, 2 + vspltw 15, 19, 3 + +.align 5 +quarter_loop: + QT_loop_4x + + bdnz quarter_loop + + vadduwm 12, 12, 30 + TP_4x 0, 1, 2, 3 + TP_4x 4, 5, 6, 7 + TP_4x 8, 9, 10, 11 + TP_4x 12, 13, 14, 15 + + Add_state 0 + Write_256 0 + addi 14, 14, 256 # offset += 256 + addi 15, 15, -256 # len += 256 + + # Update state counter + vspltisw 25, 4 + vadduwm 30, 30, 25 + + cmpdi 15, 0 + beq Out_loop + cmpdi 15, 256 + blt Out_loop + + mtctr 8 + b Loop_4x + +Out_loop: + RESTORE_REGS + blr + +Out_no_chacha: + li 3, 0 + blr +SYM_FUNC_END(chacha_p10le_8x) + +SYM_DATA_START_LOCAL(PERMX) +.align 5 +permx: +.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd +.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc +SYM_DATA_END(PERMX) diff --git a/lib/crypto/powerpc/poly1305-p10-glue.c b/lib/crypto/powerpc/poly1305-p10-glue.c new file mode 100644 index 000000000000..3f1664a724b6 --- /dev/null +++ b/lib/crypto/powerpc/poly1305-p10-glue.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Poly1305 authenticator algorithm, RFC7539. + * + * Copyright 2023- IBM Corp. All rights reserved. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen); +asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit); +asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +void poly1305_block_init_arch(struct poly1305_block_state *dctx, + const u8 raw_key[POLY1305_BLOCK_SIZE]) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_block_init_generic(dctx, raw_key); + + dctx->h = (struct poly1305_state){}; + dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0); + dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8); +} +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_blocks_generic(state, src, len, padbit); + vsx_begin(); + if (len >= POLY1305_BLOCK_SIZE * 4) { + poly1305_p10le_4blocks(state, src, len); + src += len - (len % (POLY1305_BLOCK_SIZE * 4)); + len %= POLY1305_BLOCK_SIZE * 4; + } + while (len >= POLY1305_BLOCK_SIZE) { + poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit); + len -= POLY1305_BLOCK_SIZE; + src += POLY1305_BLOCK_SIZE; + } + vsx_end(); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_emit_generic(state, digest, nonce); + poly1305_emit_64(state, nonce, digest); +} +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return static_key_enabled(&have_p10); +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init poly1305_p10_init(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + static_branch_enable(&have_p10); + return 0; +} +subsys_initcall(poly1305_p10_init); + +static void __exit poly1305_p10_exit(void) +{ +} +module_exit(poly1305_p10_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Danny Tsen "); +MODULE_DESCRIPTION("Optimized Poly1305 for P10"); diff --git a/lib/crypto/powerpc/poly1305-p10le_64.S b/lib/crypto/powerpc/poly1305-p10le_64.S new file mode 100644 index 000000000000..a3c1987f1ecd --- /dev/null +++ b/lib/crypto/powerpc/poly1305-p10le_64.S @@ -0,0 +1,1075 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated poly1305 implementation for ppc64le. +# +# Copyright 2023- IBM Corp. All rights reserved +# +#=================================================================================== +# Written by Danny Tsen +# +# Poly1305 - this version mainly using vector/VSX/Scalar +# - 26 bits limbs +# - Handle multiple 64 byte blcok. +# +# Block size 16 bytes +# key = (r, s) +# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF +# p = 2^130 - 5 +# a += m +# a = (r + a) % p +# a += s +# +# Improve performance by breaking down polynominal to the sum of products with +# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r +# +# 07/22/21 - this revison based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0 +# to 9 vectors for multiplications. 
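+#
+# For reference, the identity behind the 4-block form (a sketch, with the
+# running state h0 folded into the first block):
+#
+#   h4 = ((((h0 + m1)*r + m2)*r + m3)*r + m4)*r           (Horner form)
+#      = (h0 + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r    (sum-of-products form)
+#
+# so four blocks are combined in one pass using the precomputed powers
+# r^4, r^3, r^2, r, which removes the per-block dependency chain.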
+# +# setup r^4, r^3, r^2, r vectors +# vs [r^1, r^3, r^2, r^4] +# vs0 = [r0,.....] +# vs1 = [r1,.....] +# vs2 = [r2,.....] +# vs3 = [r3,.....] +# vs4 = [r4,.....] +# vs5 = [r1*5,...] +# vs6 = [r2*5,...] +# vs7 = [r2*5,...] +# vs8 = [r4*5,...] +# +# Each word in a vector consists a member of a "r/s" in [a * r/s]. +# +# r0, r4*5, r3*5, r2*5, r1*5; +# r1, r0, r4*5, r3*5, r2*5; +# r2, r1, r0, r4*5, r3*5; +# r3, r2, r1, r0, r4*5; +# r4, r3, r2, r1, r0 ; +# +# +# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) +# k = 32 bytes key +# r3 = k (r, s) +# r4 = mlen +# r5 = m +# +#include +#include +#include +#include + +.machine "any" + +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx \VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx \VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi 9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi 9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416, 9 + RESTORE_VSX 29, 432, 9 + RESTORE_VSX 30, 448, 9 + RESTORE_VSX 31, 464, 9 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 752 + ld 0, 
16(1) + mtlr 0 +.endm # RESTORE_REGS + +# +# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5; +# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5; +# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5; +# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5; +# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ; +# +# [r^2, r^3, r^1, r^4] +# [m3, m2, m4, m1] +# +# multiply odd and even words +.macro mul_odd + vmulouw 14, 4, 26 + vmulouw 10, 5, 3 + vmulouw 11, 6, 2 + vmulouw 12, 7, 1 + vmulouw 13, 8, 0 + vmulouw 15, 4, 27 + vaddudm 14, 14, 10 + vaddudm 14, 14, 11 + vmulouw 10, 5, 26 + vmulouw 11, 6, 3 + vaddudm 14, 14, 12 + vaddudm 14, 14, 13 # x0 + vaddudm 15, 15, 10 + vaddudm 15, 15, 11 + vmulouw 12, 7, 2 + vmulouw 13, 8, 1 + vaddudm 15, 15, 12 + vaddudm 15, 15, 13 # x1 + vmulouw 16, 4, 28 + vmulouw 10, 5, 27 + vmulouw 11, 6, 26 + vaddudm 16, 16, 10 + vaddudm 16, 16, 11 + vmulouw 12, 7, 3 + vmulouw 13, 8, 2 + vaddudm 16, 16, 12 + vaddudm 16, 16, 13 # x2 + vmulouw 17, 4, 29 + vmulouw 10, 5, 28 + vmulouw 11, 6, 27 + vaddudm 17, 17, 10 + vaddudm 17, 17, 11 + vmulouw 12, 7, 26 + vmulouw 13, 8, 3 + vaddudm 17, 17, 12 + vaddudm 17, 17, 13 # x3 + vmulouw 18, 4, 30 + vmulouw 10, 5, 29 + vmulouw 11, 6, 28 + vaddudm 18, 18, 10 + vaddudm 18, 18, 11 + vmulouw 12, 7, 27 + vmulouw 13, 8, 26 + vaddudm 18, 18, 12 + vaddudm 18, 18, 13 # x4 +.endm + +.macro mul_even + vmuleuw 9, 4, 26 + vmuleuw 10, 5, 3 + vmuleuw 11, 6, 2 + vmuleuw 12, 7, 1 + vmuleuw 13, 8, 0 + vaddudm 14, 14, 9 + vaddudm 14, 14, 10 + vaddudm 14, 14, 11 + vaddudm 14, 14, 12 + vaddudm 14, 14, 13 # x0 + + vmuleuw 9, 4, 27 + vmuleuw 10, 5, 26 + vmuleuw 11, 6, 3 + vmuleuw 12, 7, 2 + vmuleuw 13, 8, 1 + vaddudm 15, 15, 9 + vaddudm 15, 15, 10 + vaddudm 15, 15, 11 + vaddudm 15, 15, 12 + vaddudm 15, 15, 13 # x1 + + vmuleuw 9, 4, 28 + vmuleuw 10, 5, 27 + vmuleuw 11, 6, 26 + vmuleuw 12, 7, 3 + vmuleuw 13, 8, 2 + vaddudm 16, 16, 9 + vaddudm 16, 16, 10 + vaddudm 16, 16, 11 + vaddudm 16, 16, 12 + vaddudm 16, 16, 13 # x2 + + vmuleuw 9, 4, 29 + vmuleuw 10, 5, 28 + vmuleuw 11, 6, 27 + vmuleuw 12, 7, 26 + vmuleuw 13, 8, 3 + vaddudm 17, 17, 9 + vaddudm 17, 17, 10 + vaddudm 17, 17, 11 + vaddudm 17, 17, 12 + vaddudm 17, 17, 13 # x3 + + vmuleuw 9, 4, 30 + vmuleuw 10, 5, 29 + vmuleuw 11, 6, 28 + vmuleuw 12, 7, 27 + vmuleuw 13, 8, 26 + vaddudm 18, 18, 9 + vaddudm 18, 18, 10 + vaddudm 18, 18, 11 + vaddudm 18, 18, 12 + vaddudm 18, 18, 13 # x4 +.endm + +# +# poly1305_setup_r +# +# setup r^4, r^3, r^2, r vectors +# [r, r^3, r^2, r^4] +# vs0 = [r0,...] +# vs1 = [r1,...] +# vs2 = [r2,...] +# vs3 = [r3,...] +# vs4 = [r4,...] +# vs5 = [r4*5,...] +# vs6 = [r3*5,...] +# vs7 = [r2*5,...] +# vs8 = [r1*5,...] 
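+#
+# (The r*5 multiples above exist because the limbs are 26 bits wide and
+# 2^130 == 5 (mod 2^130 - 5): any partial product a_j * r_k with
+# j + k >= 5 wraps past 2^130 and reappears with a factor of 5, which is
+# folded into the precomputed r1*5 .. r4*5 values used in the schedule
+# below.)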
+# +# r0, r4*5, r3*5, r2*5, r1*5; +# r1, r0, r4*5, r3*5, r2*5; +# r2, r1, r0, r4*5, r3*5; +# r3, r2, r1, r0, r4*5; +# r4, r3, r2, r1, r0 ; +# +.macro poly1305_setup_r + + # save r + xxlor 26, 58, 58 + xxlor 27, 59, 59 + xxlor 28, 60, 60 + xxlor 29, 61, 61 + xxlor 30, 62, 62 + + xxlxor 31, 31, 31 + +# [r, r^3, r^2, r^4] + # compute r^2 + vmr 4, 26 + vmr 5, 27 + vmr 6, 28 + vmr 7, 29 + vmr 8, 30 + bl do_mul # r^2 r^1 + xxpermdi 58, 58, 36, 0x3 # r0 + xxpermdi 59, 59, 37, 0x3 # r1 + xxpermdi 60, 60, 38, 0x3 # r2 + xxpermdi 61, 61, 39, 0x3 # r3 + xxpermdi 62, 62, 40, 0x3 # r4 + xxpermdi 36, 36, 36, 0x3 + xxpermdi 37, 37, 37, 0x3 + xxpermdi 38, 38, 38, 0x3 + xxpermdi 39, 39, 39, 0x3 + xxpermdi 40, 40, 40, 0x3 + vspltisb 13, 2 + vsld 9, 27, 13 + vsld 10, 28, 13 + vsld 11, 29, 13 + vsld 12, 30, 13 + vaddudm 0, 9, 27 + vaddudm 1, 10, 28 + vaddudm 2, 11, 29 + vaddudm 3, 12, 30 + + bl do_mul # r^4 r^3 + vmrgow 26, 26, 4 + vmrgow 27, 27, 5 + vmrgow 28, 28, 6 + vmrgow 29, 29, 7 + vmrgow 30, 30, 8 + vspltisb 13, 2 + vsld 9, 27, 13 + vsld 10, 28, 13 + vsld 11, 29, 13 + vsld 12, 30, 13 + vaddudm 0, 9, 27 + vaddudm 1, 10, 28 + vaddudm 2, 11, 29 + vaddudm 3, 12, 30 + + # r^2 r^4 + xxlor 0, 58, 58 + xxlor 1, 59, 59 + xxlor 2, 60, 60 + xxlor 3, 61, 61 + xxlor 4, 62, 62 + xxlor 5, 32, 32 + xxlor 6, 33, 33 + xxlor 7, 34, 34 + xxlor 8, 35, 35 + + vspltw 9, 26, 3 + vspltw 10, 26, 2 + vmrgow 26, 10, 9 + vspltw 9, 27, 3 + vspltw 10, 27, 2 + vmrgow 27, 10, 9 + vspltw 9, 28, 3 + vspltw 10, 28, 2 + vmrgow 28, 10, 9 + vspltw 9, 29, 3 + vspltw 10, 29, 2 + vmrgow 29, 10, 9 + vspltw 9, 30, 3 + vspltw 10, 30, 2 + vmrgow 30, 10, 9 + + vsld 9, 27, 13 + vsld 10, 28, 13 + vsld 11, 29, 13 + vsld 12, 30, 13 + vaddudm 0, 9, 27 + vaddudm 1, 10, 28 + vaddudm 2, 11, 29 + vaddudm 3, 12, 30 +.endm + +SYM_FUNC_START_LOCAL(do_mul) + mul_odd + + # do reduction ( h %= p ) + # carry reduction + vspltisb 9, 2 + vsrd 10, 14, 31 + vsrd 11, 17, 31 + vand 7, 17, 25 + vand 4, 14, 25 + vaddudm 18, 18, 11 + vsrd 12, 18, 31 + vaddudm 15, 15, 10 + + vsrd 11, 15, 31 + vand 8, 18, 25 + vand 5, 15, 25 + vaddudm 4, 4, 12 + vsld 10, 12, 9 + vaddudm 6, 16, 11 + + vsrd 13, 6, 31 + vand 6, 6, 25 + vaddudm 4, 4, 10 + vsrd 10, 4, 31 + vaddudm 7, 7, 13 + + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 5, 5, 10 + vaddudm 8, 8, 11 + blr +SYM_FUNC_END(do_mul) + +# +# init key +# +.macro do_poly1305_init + addis 10, 2, rmask@toc@ha + addi 10, 10, rmask@toc@l + + ld 11, 0(10) + ld 12, 8(10) + + li 14, 16 + li 15, 32 + addis 10, 2, cnum@toc@ha + addi 10, 10, cnum@toc@l + lvx 25, 0, 10 # v25 - mask + lvx 31, 14, 10 # v31 = 1a + lvx 19, 15, 10 # v19 = 1 << 24 + lxv 24, 48(10) # vs24 + lxv 25, 64(10) # vs25 + + # initialize + # load key from r3 to vectors + ld 9, 24(3) + ld 10, 32(3) + and. 9, 9, 11 + and. 
10, 10, 12 + + # break 26 bits + extrdi 14, 9, 26, 38 + extrdi 15, 9, 26, 12 + extrdi 16, 9, 12, 0 + mtvsrdd 58, 0, 14 + insrdi 16, 10, 14, 38 + mtvsrdd 59, 0, 15 + extrdi 17, 10, 26, 24 + mtvsrdd 60, 0, 16 + extrdi 18, 10, 24, 0 + mtvsrdd 61, 0, 17 + mtvsrdd 62, 0, 18 + + # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5 + li 9, 5 + mtvsrdd 36, 0, 9 + vmulouw 0, 27, 4 # v0 = rr0 + vmulouw 1, 28, 4 # v1 = rr1 + vmulouw 2, 29, 4 # v2 = rr2 + vmulouw 3, 30, 4 # v3 = rr3 +.endm + +# +# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) +# k = 32 bytes key +# r3 = k (r, s) +# r4 = mlen +# r5 = m +# +SYM_FUNC_START(poly1305_p10le_4blocks) +.align 5 + cmpdi 5, 64 + blt Out_no_poly1305 + + SAVE_REGS + + do_poly1305_init + + li 21, 0 # counter to message + + poly1305_setup_r + + # load previous H state + # break/convert r6 to 26 bits + ld 9, 0(3) + ld 10, 8(3) + ld 19, 16(3) + sldi 19, 19, 24 + mtvsrdd 41, 0, 19 + extrdi 14, 9, 26, 38 + extrdi 15, 9, 26, 12 + extrdi 16, 9, 12, 0 + mtvsrdd 36, 0, 14 + insrdi 16, 10, 14, 38 + mtvsrdd 37, 0, 15 + extrdi 17, 10, 26, 24 + mtvsrdd 38, 0, 16 + extrdi 18, 10, 24, 0 + mtvsrdd 39, 0, 17 + mtvsrdd 40, 0, 18 + vor 8, 8, 9 + + # input m1 m2 + add 20, 4, 21 + xxlor 49, 24, 24 + xxlor 50, 25, 25 + lxvw4x 43, 0, 20 + addi 17, 20, 16 + lxvw4x 44, 0, 17 + vperm 14, 11, 12, 17 + vperm 15, 11, 12, 18 + vand 9, 14, 25 # a0 + vsrd 10, 14, 31 # >> 26 + vsrd 11, 10, 31 # 12 bits left + vand 10, 10, 25 # a1 + vspltisb 13, 12 + vand 16, 15, 25 + vsld 12, 16, 13 + vor 11, 11, 12 + vand 11, 11, 25 # a2 + vspltisb 13, 14 + vsrd 12, 15, 13 # >> 14 + vsrd 13, 12, 31 # >> 26, a4 + vand 12, 12, 25 # a3 + + vaddudm 20, 4, 9 + vaddudm 21, 5, 10 + vaddudm 22, 6, 11 + vaddudm 23, 7, 12 + vaddudm 24, 8, 13 + + # m3 m4 + addi 17, 17, 16 + lxvw4x 43, 0, 17 + addi 17, 17, 16 + lxvw4x 44, 0, 17 + vperm 14, 11, 12, 17 + vperm 15, 11, 12, 18 + vand 9, 14, 25 # a0 + vsrd 10, 14, 31 # >> 26 + vsrd 11, 10, 31 # 12 bits left + vand 10, 10, 25 # a1 + vspltisb 13, 12 + vand 16, 15, 25 + vsld 12, 16, 13 + vspltisb 13, 14 + vor 11, 11, 12 + vand 11, 11, 25 # a2 + vsrd 12, 15, 13 # >> 14 + vsrd 13, 12, 31 # >> 26, a4 + vand 12, 12, 25 # a3 + + # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] + vmrgow 4, 9, 20 + vmrgow 5, 10, 21 + vmrgow 6, 11, 22 + vmrgow 7, 12, 23 + vmrgow 8, 13, 24 + vaddudm 8, 8, 19 + + addi 5, 5, -64 # len -= 64 + addi 21, 21, 64 # offset += 64 + + li 9, 64 + divdu 31, 5, 9 + + cmpdi 31, 0 + ble Skip_block_loop + + mtctr 31 + +# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r +# Rewrite the polynominal sum of product as follows, +# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2 +# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2 +# .... 
Repeat +# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 --> +# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r +# +loop_4blocks: + + # Multiply odd words and even words + mul_odd + mul_even + # carry reduction + vspltisb 9, 2 + vsrd 10, 14, 31 + vsrd 11, 17, 31 + vand 7, 17, 25 + vand 4, 14, 25 + vaddudm 18, 18, 11 + vsrd 12, 18, 31 + vaddudm 15, 15, 10 + + vsrd 11, 15, 31 + vand 8, 18, 25 + vand 5, 15, 25 + vaddudm 4, 4, 12 + vsld 10, 12, 9 + vaddudm 6, 16, 11 + + vsrd 13, 6, 31 + vand 6, 6, 25 + vaddudm 4, 4, 10 + vsrd 10, 4, 31 + vaddudm 7, 7, 13 + + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 5, 5, 10 + vaddudm 8, 8, 11 + + # input m1 m2 m3 m4 + add 20, 4, 21 + xxlor 49, 24, 24 + xxlor 50, 25, 25 + lxvw4x 43, 0, 20 + addi 17, 20, 16 + lxvw4x 44, 0, 17 + vperm 14, 11, 12, 17 + vperm 15, 11, 12, 18 + addi 17, 17, 16 + lxvw4x 43, 0, 17 + addi 17, 17, 16 + lxvw4x 44, 0, 17 + vperm 17, 11, 12, 17 + vperm 18, 11, 12, 18 + + vand 20, 14, 25 # a0 + vand 9, 17, 25 # a0 + vsrd 21, 14, 31 # >> 26 + vsrd 22, 21, 31 # 12 bits left + vsrd 10, 17, 31 # >> 26 + vsrd 11, 10, 31 # 12 bits left + + vand 21, 21, 25 # a1 + vand 10, 10, 25 # a1 + + vspltisb 13, 12 + vand 16, 15, 25 + vsld 23, 16, 13 + vor 22, 22, 23 + vand 22, 22, 25 # a2 + vand 16, 18, 25 + vsld 12, 16, 13 + vor 11, 11, 12 + vand 11, 11, 25 # a2 + vspltisb 13, 14 + vsrd 23, 15, 13 # >> 14 + vsrd 24, 23, 31 # >> 26, a4 + vand 23, 23, 25 # a3 + vsrd 12, 18, 13 # >> 14 + vsrd 13, 12, 31 # >> 26, a4 + vand 12, 12, 25 # a3 + + vaddudm 4, 4, 20 + vaddudm 5, 5, 21 + vaddudm 6, 6, 22 + vaddudm 7, 7, 23 + vaddudm 8, 8, 24 + + # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] + vmrgow 4, 9, 4 + vmrgow 5, 10, 5 + vmrgow 6, 11, 6 + vmrgow 7, 12, 7 + vmrgow 8, 13, 8 + vaddudm 8, 8, 19 + + addi 5, 5, -64 # len -= 64 + addi 21, 21, 64 # offset += 64 + + bdnz loop_4blocks + +Skip_block_loop: + xxlor 58, 0, 0 + xxlor 59, 1, 1 + xxlor 60, 2, 2 + xxlor 61, 3, 3 + xxlor 62, 4, 4 + xxlor 32, 5, 5 + xxlor 33, 6, 6 + xxlor 34, 7, 7 + xxlor 35, 8, 8 + + # Multiply odd words and even words + mul_odd + mul_even + + # Sum the products. 
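
The code below first folds the two interleaved lanes together and then carry-reduces. The shift/mask/fold pattern, which also appears after every mul_odd/mul_even pair, is the usual base 2^26 partial reduction: each 64-bit accumulator carries into the next limb, and the carry out of the top limb is folded back into limb 0 times 5, since 2^130 == 5 (mod 2^130 - 5). A scalar model of that reduction, illustrative only (the helper name is made up):

#include <stdint.h>

/*
 * Partial reduction of the 64-bit accumulators p[] back to limbs of
 * roughly 26 bits.  Small excesses left in the limbs are absorbed by
 * the next multiply, as usual for this representation.
 */
static void poly1305_carry_26(uint32_t h[5], uint64_t p[5])
{
        uint64_t c;
        int i;

        c = p[0] >> 26;  p[0] &= 0x3ffffff;  p[1] += c;
        c = p[1] >> 26;  p[1] &= 0x3ffffff;  p[2] += c;
        c = p[2] >> 26;  p[2] &= 0x3ffffff;  p[3] += c;
        c = p[3] >> 26;  p[3] &= 0x3ffffff;  p[4] += c;
        c = p[4] >> 26;  p[4] &= 0x3ffffff;
        p[0] += c * 5;                  /* 2^130 == 5 (mod 2^130 - 5) */
        c = p[0] >> 26;  p[0] &= 0x3ffffff;  p[1] += c;

        for (i = 0; i < 5; i++)
                h[i] = (uint32_t)p[i];
}

The vector code performs the same carries in a slightly different order so that independent shifts and masks can be interleaved.
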
+ xxpermdi 41, 31, 46, 0 + xxpermdi 42, 31, 47, 0 + vaddudm 4, 14, 9 + xxpermdi 36, 31, 36, 3 + vaddudm 5, 15, 10 + xxpermdi 37, 31, 37, 3 + xxpermdi 43, 31, 48, 0 + vaddudm 6, 16, 11 + xxpermdi 38, 31, 38, 3 + xxpermdi 44, 31, 49, 0 + vaddudm 7, 17, 12 + xxpermdi 39, 31, 39, 3 + xxpermdi 45, 31, 50, 0 + vaddudm 8, 18, 13 + xxpermdi 40, 31, 40, 3 + + # carry reduction + vspltisb 9, 2 + vsrd 10, 4, 31 + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 8, 8, 11 + vsrd 12, 8, 31 + vaddudm 5, 5, 10 + + vsrd 11, 5, 31 + vand 8, 8, 25 + vand 5, 5, 25 + vaddudm 4, 4, 12 + vsld 10, 12, 9 + vaddudm 6, 6, 11 + + vsrd 13, 6, 31 + vand 6, 6, 25 + vaddudm 4, 4, 10 + vsrd 10, 4, 31 + vaddudm 7, 7, 13 + + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 5, 5, 10 + vsrd 10, 5, 31 + vand 5, 5, 25 + vaddudm 6, 6, 10 + vaddudm 8, 8, 11 + + b do_final_update + +do_final_update: + # combine 26 bit limbs + # v4, v5, v6, v7 and v8 are 26 bit vectors + vsld 5, 5, 31 + vor 20, 4, 5 + vspltisb 11, 12 + vsrd 12, 6, 11 + vsld 6, 6, 31 + vsld 6, 6, 31 + vor 20, 20, 6 + vspltisb 11, 14 + vsld 7, 7, 11 + vor 21, 7, 12 + mfvsrld 16, 40 # save last 2 bytes + vsld 8, 8, 11 + vsld 8, 8, 31 + vor 21, 21, 8 + mfvsrld 17, 52 + mfvsrld 19, 53 + srdi 16, 16, 24 + + std 17, 0(3) + std 19, 8(3) + stw 16, 16(3) + +Out_loop: + li 3, 0 + + RESTORE_REGS + + blr + +Out_no_poly1305: + li 3, 0 + blr +SYM_FUNC_END(poly1305_p10le_4blocks) + +# +# ======================================================================= +# The following functions implement 64 x 64 bits multiplication poly1305. +# +SYM_FUNC_START_LOCAL(Poly1305_init_64) + # mask 0x0FFFFFFC0FFFFFFC + # mask 0x0FFFFFFC0FFFFFFF + addis 10, 2, rmask@toc@ha + addi 10, 10, rmask@toc@l + ld 11, 0(10) + ld 12, 8(10) + + # initialize + # load key from r3 + ld 9, 24(3) + ld 10, 32(3) + and. 9, 9, 11 # cramp mask r0 + and. 10, 10, 12 # cramp mask r1 + + srdi 21, 10, 2 + add 19, 21, 10 # s1: r19 - (r1 >> 2) *5 + + # setup r and s + li 25, 0 + mtvsrdd 32+0, 9, 19 # r0, s1 + mtvsrdd 32+1, 10, 9 # r1, r0 + mtvsrdd 32+2, 19, 25 # s1 + mtvsrdd 32+3, 9, 25 # r0 + + blr +SYM_FUNC_END(Poly1305_init_64) + +# Poly1305_mult +# v6 = (h0, h1), v8 = h2 +# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0 +# +# Output: v7, v10, v11 +# +SYM_FUNC_START_LOCAL(Poly1305_mult) + # + # d0 = h0 * r0 + h1 * s1 + vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1 + + # d1 = h0 * r1 + h1 * r0 + h2 * s1 + vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0 + vmsumudm 10, 8, 2, 11 # d1 += h2 * s1 + + # d2 = r0 + vmsumudm 11, 8, 3, 9 # d2 = h2 * r0 + blr +SYM_FUNC_END(Poly1305_mult) + +# +# carry reduction +# h %=p +# +# Input: v7, v10, v11 +# Output: r27, r28, r29 +# +SYM_FUNC_START_LOCAL(Carry_reduction) + mfvsrld 27, 32+7 + mfvsrld 28, 32+10 + mfvsrld 29, 32+11 + mfvsrd 20, 32+7 # h0.h + mfvsrd 21, 32+10 # h1.h + + addc 28, 28, 20 + adde 29, 29, 21 + srdi 22, 29, 0x2 + sldi 23, 22, 0x2 + add 23, 23, 22 # (h2 & 3) * 5 + addc 27, 27, 23 # h0 + addze 28, 28 # h1 + andi. 
29, 29, 0x3 # h2 + blr +SYM_FUNC_END(Carry_reduction) + +# +# poly1305 multiplication +# h *= r, h %= p +# d0 = h0 * r0 + h1 * s1 +# d1 = h0 * r1 + h1 * r0 + h2 * s1 +# d2 = h0 * r0 +# +# +# unsigned int poly1305_test_64s(unisgned char *state, const byte *src, size_t len, highbit) +# - no highbit if final leftover block (highbit = 0) +# +SYM_FUNC_START(poly1305_64s) + cmpdi 5, 0 + ble Out_no_poly1305_64 + + mflr 0 + std 0, 16(1) + stdu 1,-400(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + # Init poly1305 + bl Poly1305_init_64 + + li 25, 0 # offset to inp and outp + + add 11, 25, 4 + + # load h + # h0, h1, h2? + ld 27, 0(3) + ld 28, 8(3) + lwz 29, 16(3) + + li 30, 16 + divdu 31, 5, 30 + + mtctr 31 + + mr 24, 6 # highbit + +Loop_block_64: + vxor 9, 9, 9 + + ld 20, 0(11) + ld 21, 8(11) + addi 11, 11, 16 + + addc 27, 27, 20 + adde 28, 28, 21 + adde 29, 29, 24 + + li 22, 0 + mtvsrdd 32+6, 27, 28 # h0, h1 + mtvsrdd 32+8, 29, 22 # h2 + + bl Poly1305_mult + + bl Carry_reduction + + bdnz Loop_block_64 + + std 27, 0(3) + std 28, 8(3) + stw 29, 16(3) + + li 3, 0 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 400 + ld 0, 16(1) + mtlr 0 + + blr + +Out_no_poly1305_64: + li 3, 0 + blr +SYM_FUNC_END(poly1305_64s) + +# +# Input: r3 = h, r4 = s, r5 = mac +# mac = h + s +# +SYM_FUNC_START(poly1305_emit_64) + ld 10, 0(3) + ld 11, 8(3) + ld 12, 16(3) + + # compare modulus + # h + 5 + (-p) + mr 6, 10 + mr 7, 11 + mr 8, 12 + addic. 6, 6, 5 + addze 7, 7 + addze 8, 8 + srdi 9, 8, 2 # overflow? 
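
The rest of poly1305_emit_64, continued just below, keeps h + 5 only when that addition carries past bit 129 (i.e. h >= 2^130 - 5) and then adds the 128-bit value s, storing the low 128 bits as the tag. A scalar C model of that logic, assuming the little-endian layout this code targets; the function name is made up:

#include <stdint.h>
#include <string.h>

static void poly1305_emit_model(uint8_t mac[16], const uint64_t h[3],
                                const uint64_t s[2])
{
        uint64_t h0 = h[0], h1 = h[1], h2 = h[2];
        uint64_t g0, g1, g2, c;

        /* g = h + 5; if that reaches 2^130, h was >= p, so use g. */
        g0 = h0 + 5;    c = (g0 < h0);
        g1 = h1 + c;    c = (c && g1 == 0);
        g2 = h2 + c;
        if (g2 >> 2) {
                h0 = g0;
                h1 = g1;
        }

        /* mac = (h + s) mod 2^128 */
        h0 += s[0];
        c = (h0 < s[0]);
        h1 += s[1] + c;

        memcpy(mac, &h0, 8);    /* little-endian store */
        memcpy(mac + 8, &h1, 8);
}
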
+ cmpdi 9, 0 + beq Skip_h64 + mr 10, 6 + mr 11, 7 + mr 12, 8 + +Skip_h64: + ld 6, 0(4) + ld 7, 8(4) + addc 10, 10, 6 + adde 11, 11, 7 + addze 12, 12 + + std 10, 0(5) + std 11, 8(5) + blr +SYM_FUNC_END(poly1305_emit_64) + +SYM_DATA_START_LOCAL(RMASK) +.align 5 +rmask: +.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f +cnum: +.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000 +.long 0x1a, 0x00, 0x1a, 0x00 +.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 +.long 0x00010203, 0x04050607, 0x10111213, 0x14151617 +.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f +SYM_DATA_END(RMASK) diff --git a/lib/crypto/powerpc/sha256-spe-asm.S b/lib/crypto/powerpc/sha256-spe-asm.S new file mode 100644 index 000000000000..cd99d71dae34 --- /dev/null +++ b/lib/crypto/powerpc/sha256-spe-asm.S @@ -0,0 +1,318 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Fast SHA-256 implementation for SPE instruction set (PPC) + * + * This code makes use of the SPE SIMD instruction set as defined in + * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf + * Implementation is based on optimization guide notes from + * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf + * + * Copyright (c) 2015 Markus Stockhausen + */ + +#include +#include + +#define rHP r3 /* pointer to hash values in memory */ +#define rKP r24 /* pointer to round constants */ +#define rWP r4 /* pointer to input data */ + +#define rH0 r5 /* 8 32 bit hash values in 8 registers */ +#define rH1 r6 +#define rH2 r7 +#define rH3 r8 +#define rH4 r9 +#define rH5 r10 +#define rH6 r11 +#define rH7 r12 + +#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */ +#define rW1 r15 +#define rW2 r16 +#define rW3 r17 +#define rW4 r18 +#define rW5 r19 +#define rW6 r20 +#define rW7 r21 + +#define rT0 r22 /* 64 bit temporaries */ +#define rT1 r23 +#define rT2 r0 /* 32 bit temporaries */ +#define rT3 r25 + +#define CMP_KN_LOOP +#define CMP_KC_LOOP \ + cmpwi rT1,0; + +#define INITIALIZE \ + stwu r1,-128(r1); /* create stack frame */ \ + evstdw r14,8(r1); /* We must save non volatile */ \ + evstdw r15,16(r1); /* registers. Take the chance */ \ + evstdw r16,24(r1); /* and save the SPE part too */ \ + evstdw r17,32(r1); \ + evstdw r18,40(r1); \ + evstdw r19,48(r1); \ + evstdw r20,56(r1); \ + evstdw r21,64(r1); \ + evstdw r22,72(r1); \ + evstdw r23,80(r1); \ + stw r24,88(r1); /* save normal registers */ \ + stw r25,92(r1); + + +#define FINALIZE \ + evldw r14,8(r1); /* restore SPE registers */ \ + evldw r15,16(r1); \ + evldw r16,24(r1); \ + evldw r17,32(r1); \ + evldw r18,40(r1); \ + evldw r19,48(r1); \ + evldw r20,56(r1); \ + evldw r21,64(r1); \ + evldw r22,72(r1); \ + evldw r23,80(r1); \ + lwz r24,88(r1); /* restore normal registers */ \ + lwz r25,92(r1); \ + xor r0,r0,r0; \ + stw r0,8(r1); /* Delete sensitive data */ \ + stw r0,16(r1); /* that we might have pushed */ \ + stw r0,24(r1); /* from other context that runs */ \ + stw r0,32(r1); /* the same code. 
Assume that */ \ + stw r0,40(r1); /* the lower part of the GPRs */ \ + stw r0,48(r1); /* was already overwritten on */ \ + stw r0,56(r1); /* the way down to here */ \ + stw r0,64(r1); \ + stw r0,72(r1); \ + stw r0,80(r1); \ + addi r1,r1,128; /* cleanup stack frame */ + +#ifdef __BIG_ENDIAN__ +#define LOAD_DATA(reg, off) \ + lwz reg,off(rWP); /* load data */ +#define NEXT_BLOCK \ + addi rWP,rWP,64; /* increment per block */ +#else +#define LOAD_DATA(reg, off) \ + lwbrx reg,0,rWP; /* load data */ \ + addi rWP,rWP,4; /* increment per word */ +#define NEXT_BLOCK /* nothing to do */ +#endif + +#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \ + LOAD_DATA(w, off) /* 1: W */ \ + rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \ + rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \ + rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \ + and rT3,e,f; /* 1: ch = e and f */ \ + xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \ + andc rT1,g,e; /* 1: ch' = ~e and g */ \ + lwz rT2,off(rKP); /* 1: K */ \ + xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \ + add h,h,rT0; /* 1: temp1 = h + S1 */ \ + add rT3,rT3,w; /* 1: temp1' = ch + w */ \ + rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \ + add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \ + rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \ + add h,h,rT2; /* 1: temp1 = temp1 + K */ \ + rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \ + add d,d,h; /* 1: d = d + temp1 */ \ + xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \ + evmergelo w,w,w; /* shift W */ \ + or rT2,a,b; /* 1: maj = a or b */ \ + and rT1,a,b; /* 1: maj' = a and b */ \ + and rT2,rT2,c; /* 1: maj = maj and c */ \ + LOAD_DATA(w, off+4) /* 2: W */ \ + or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \ + rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ + add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \ + rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ + add h,h,rT3; /* 1: h = temp1 + temp2 */ \ + rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ + and rT3,d,e; /* 2: ch = e and f */ \ + xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ + andc rT1,f,d; /* 2: ch' = ~e and g */ \ + lwz rT2,off+4(rKP); /* 2: K */ \ + xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ + add g,g,rT0; /* 2: temp1 = h + S1 */ \ + add rT3,rT3,w; /* 2: temp1' = ch + w */ \ + rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ + add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \ + rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ + add g,g,rT2; /* 2: temp1 = temp1 + K */ \ + rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ + or rT2,h,a; /* 2: maj = a or b */ \ + xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ + and rT1,h,a; /* 2: maj' = a and b */ \ + and rT2,rT2,b; /* 2: maj = maj and c */ \ + add c,c,g; /* 2: d = d + temp1 */ \ + or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ + add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ + add g,g,rT3 /* 2: h = temp1 + temp2 */ + +#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \ + rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \ + evmergelohi rT0,w0,w1; /* w[-15] */ \ + rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \ + evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \ + xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ + evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \ + rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \ + evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \ + xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ + evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \ + add h,h,rT2; /* 1: temp1 = h + S1 */ \ + evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \ + and 
rT2,e,f; /* 1: ch = e and f */ \ + evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \ + andc rT3,g,e; /* 1: ch' = ~e and g */ \ + evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \ + xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \ + evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \ + add h,h,rT2; /* 1: temp1 = temp1 + ch */ \ + evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ + rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \ + evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \ + rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \ + evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ + xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ + evldw rT1,off(rKP); /* k */ \ + rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \ + evaddw w0,w0,rT0; /* w = w + s1 */ \ + xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ + evmergelohi rT0,w4,w5; /* w[-7] */ \ + and rT3,a,b; /* 1: maj = a and b */ \ + evaddw w0,w0,rT0; /* w = w + w[-7] */ \ + CMP_K##k##_LOOP \ + add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \ + evaddw rT1,rT1,w0; /* wk = w + k */ \ + xor rT3,a,b; /* 1: maj = a xor b */ \ + evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \ + and rT3,rT3,c; /* 1: maj = maj and c */ \ + add h,h,rT0; /* 1: temp1 = temp1 + wk */ \ + add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \ + add g,g,rT1; /* 2: temp1 = temp1 + wk */ \ + add d,d,h; /* 1: d = d + temp1 */ \ + rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ + add h,h,rT2; /* 1: h = temp1 + temp2 */ \ + rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ + rotrwi rT2,d,25; /* 2: S" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ + and rT3,d,e; /* 2: ch = e and f */ \ + xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ + andc rT1,f,d; /* 2: ch' = ~e and g */ \ + add g,g,rT0; /* 2: temp1 = h + S1 */ \ + xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ + rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ + add g,g,rT3; /* 2: temp1 = temp1 + ch */ \ + rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ + rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ + or rT2,h,a; /* 2: maj = a or b */ \ + and rT1,h,a; /* 2: maj' = a and b */ \ + and rT2,rT2,b; /* 2: maj = maj and c */ \ + xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ + or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ + add c,c,g; /* 2: d = d + temp1 */ \ + add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ + add g,g,rT3 /* 2: h = temp1 + temp2 */ + +_GLOBAL(ppc_spe_sha256_transform) + INITIALIZE + + mtctr r5 + lwz rH0,0(rHP) + lwz rH1,4(rHP) + lwz rH2,8(rHP) + lwz rH3,12(rHP) + lwz rH4,16(rHP) + lwz rH5,20(rHP) + lwz rH6,24(rHP) + lwz rH7,28(rHP) + +ppc_spe_sha256_main: + lis rKP,PPC_SPE_SHA256_K@ha + addi rKP,rKP,PPC_SPE_SHA256_K@l + + R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0) + R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8) + R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16) + R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24) + R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32) + R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40) + R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48) + R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56) +ppc_spe_sha256_16_rounds: + addi rKP,rKP,64 + R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, + rW0, rW1, rW4, rW5, rW7, N, 0) + R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, + rW1, rW2, rW5, rW6, rW0, N, 8) + R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, + rW2, rW3, rW6, rW7, rW1, N, 16) + R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, + rW3, rW4, rW7, rW0, rW2, N, 24) + R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, + rW4, rW5, rW0, rW1, rW3, N, 32) + R_CALC_W(rH6, rH7, rH0, rH1, rH2, 
rH3, rH4, rH5, + rW5, rW6, rW1, rW2, rW4, N, 40) + R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, + rW6, rW7, rW2, rW3, rW5, N, 48) + R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, + rW7, rW0, rW3, rW4, rW6, C, 56) + bt gt,ppc_spe_sha256_16_rounds + + lwz rW0,0(rHP) + NEXT_BLOCK + lwz rW1,4(rHP) + lwz rW2,8(rHP) + lwz rW3,12(rHP) + lwz rW4,16(rHP) + lwz rW5,20(rHP) + lwz rW6,24(rHP) + lwz rW7,28(rHP) + + add rH0,rH0,rW0 + stw rH0,0(rHP) + add rH1,rH1,rW1 + stw rH1,4(rHP) + add rH2,rH2,rW2 + stw rH2,8(rHP) + add rH3,rH3,rW3 + stw rH3,12(rHP) + add rH4,rH4,rW4 + stw rH4,16(rHP) + add rH5,rH5,rW5 + stw rH5,20(rHP) + add rH6,rH6,rW6 + stw rH6,24(rHP) + add rH7,rH7,rW7 + stw rH7,28(rHP) + + bdnz ppc_spe_sha256_main + + FINALIZE + blr + +.data +.align 5 +PPC_SPE_SHA256_K: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 diff --git a/lib/crypto/powerpc/sha256.c b/lib/crypto/powerpc/sha256.c new file mode 100644 index 000000000000..6b0f079587eb --- /dev/null +++ b/lib/crypto/powerpc/sha256.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-256 Secure Hash Algorithm, SPE optimized + * + * Based on generic implementation. The assembler module takes care + * about the SPE registers so it can run from interrupt context. + * + * Copyright (c) 2015 Markus Stockhausen + */ + +#include +#include +#include +#include +#include + +/* + * MAX_BYTES defines the number of bytes that are allowed to be processed + * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000 + * operations per 64 bytes. e500 cores can issue two arithmetic instructions + * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). + * Thus 1KB of input data will need an estimated maximum of 18,000 cycles. + * Headroom for cache misses included. Even with the low end model clocked + * at 667 MHz this equals to a critical time window of less than 27us. + * + */ +#define MAX_BYTES 1024 + +extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks); + +static void spe_begin(void) +{ + /* We just start SPE operations and will save SPE registers later. 
*/ + preempt_disable(); + enable_kernel_spe(); +} + +static void spe_end(void) +{ + disable_kernel_spe(); + /* reenable preemption */ + preempt_enable(); +} + +void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + do { + /* cut input data into smaller blocks */ + u32 unit = min_t(size_t, nblocks, + MAX_BYTES / SHA256_BLOCK_SIZE); + + spe_begin(); + ppc_spe_sha256_transform(state, data, unit); + spe_end(); + + data += unit * SHA256_BLOCK_SIZE; + nblocks -= unit; + } while (nblocks); +} +EXPORT_SYMBOL_GPL(sha256_blocks_arch); + +bool sha256_is_arch_optimized(void) +{ + return true; +} +EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm, SPE optimized"); -- cgit v1.2.3 From daed4fcf04db425e44d377a425424752881a2a68 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 19 Jun 2025 12:19:04 -0700 Subject: lib/crypto: riscv: Move arch/riscv/lib/crypto/ into lib/crypto/ Move the contents of arch/riscv/lib/crypto/ into lib/crypto/riscv/. The new code organization makes a lot more sense for how this code actually works and is developed. In particular, it makes it possible to build each algorithm as a single module, with better inlining and dead code elimination. For a more detailed explanation, see the patchset which did this for the CRC library code: https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/. Also see the patchset which did this for SHA-512: https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/ This is just a preparatory commit, which does the move to get the files into their new location but keeps them building the same way as before. Later commits will make the actual improvements to the way the arch-optimized code is integrated for each algorithm. Acked-by: Ard Biesheuvel Acked-by: Palmer Dabbelt Reviewed-by: Martin K. 
Petersen Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/r/20250619191908.134235-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/riscv/lib/Makefile | 1 - arch/riscv/lib/crypto/Kconfig | 16 -- arch/riscv/lib/crypto/Makefile | 7 - arch/riscv/lib/crypto/chacha-riscv64-glue.c | 75 ------ arch/riscv/lib/crypto/chacha-riscv64-zvkb.S | 297 --------------------- .../crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S | 225 ---------------- arch/riscv/lib/crypto/sha256.c | 67 ----- lib/crypto/Kconfig | 2 +- lib/crypto/Makefile | 1 + lib/crypto/riscv/Kconfig | 16 ++ lib/crypto/riscv/Makefile | 7 + lib/crypto/riscv/chacha-riscv64-glue.c | 75 ++++++ lib/crypto/riscv/chacha-riscv64-zvkb.S | 297 +++++++++++++++++++++ .../riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S | 225 ++++++++++++++++ lib/crypto/riscv/sha256.c | 67 +++++ 15 files changed, 689 insertions(+), 689 deletions(-) delete mode 100644 arch/riscv/lib/crypto/Kconfig delete mode 100644 arch/riscv/lib/crypto/Makefile delete mode 100644 arch/riscv/lib/crypto/chacha-riscv64-glue.c delete mode 100644 arch/riscv/lib/crypto/chacha-riscv64-zvkb.S delete mode 100644 arch/riscv/lib/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S delete mode 100644 arch/riscv/lib/crypto/sha256.c create mode 100644 lib/crypto/riscv/Kconfig create mode 100644 lib/crypto/riscv/Makefile create mode 100644 lib/crypto/riscv/chacha-riscv64-glue.c create mode 100644 lib/crypto/riscv/chacha-riscv64-zvkb.S create mode 100644 lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S create mode 100644 lib/crypto/riscv/sha256.c diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index 0baec92d2f55..b1c46153606a 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += crypto/ lib-y += delay.o lib-y += memcpy.o lib-y += memset.o diff --git a/arch/riscv/lib/crypto/Kconfig b/arch/riscv/lib/crypto/Kconfig deleted file mode 100644 index 47c99ea97ce2..000000000000 --- a/arch/riscv/lib/crypto/Kconfig +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_RISCV64 - tristate - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - -config CRYPTO_SHA256_RISCV64 - tristate - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD - select CRYPTO_LIB_SHA256_GENERIC diff --git a/arch/riscv/lib/crypto/Makefile b/arch/riscv/lib/crypto/Makefile deleted file mode 100644 index b7cb877a2c07..000000000000 --- a/arch/riscv/lib/crypto/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o -chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o - -obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o -sha256-riscv64-y := sha256.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o diff --git a/arch/riscv/lib/crypto/chacha-riscv64-glue.c b/arch/riscv/lib/crypto/chacha-riscv64-glue.c deleted file mode 100644 index 8c3f11d79be3..000000000000 --- a/arch/riscv/lib/crypto/chacha-riscv64-glue.c +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * ChaCha stream cipher (RISC-V optimized) - * - * Copyright (C) 2023 SiFive, Inc. 
- * Author: Jerry Shih - */ - -#include -#include -#include -#include -#include -#include - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb); - -asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out, - size_t nblocks, int nrounds); - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - hchacha_block_generic(state, out, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - u8 block_buffer[CHACHA_BLOCK_SIZE]; - unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE; - unsigned int tail_bytes = bytes % CHACHA_BLOCK_SIZE; - - if (!static_branch_likely(&use_zvkb) || !crypto_simd_usable()) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - kernel_vector_begin(); - if (full_blocks) { - chacha_zvkb(state, src, dst, full_blocks, nrounds); - src += full_blocks * CHACHA_BLOCK_SIZE; - dst += full_blocks * CHACHA_BLOCK_SIZE; - } - if (tail_bytes) { - memcpy(block_buffer, src, tail_bytes); - chacha_zvkb(state, block_buffer, block_buffer, 1, nrounds); - memcpy(dst, block_buffer, tail_bytes); - } - kernel_vector_end(); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&use_zvkb); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -static int __init riscv64_chacha_mod_init(void) -{ - if (riscv_isa_extension_available(NULL, ZVKB) && - riscv_vector_vlen() >= 128) - static_branch_enable(&use_zvkb); - return 0; -} -subsys_initcall(riscv64_chacha_mod_init); - -static void __exit riscv64_chacha_mod_exit(void) -{ -} -module_exit(riscv64_chacha_mod_exit); - -MODULE_DESCRIPTION("ChaCha stream cipher (RISC-V optimized)"); -MODULE_AUTHOR("Jerry Shih "); -MODULE_LICENSE("GPL"); diff --git a/arch/riscv/lib/crypto/chacha-riscv64-zvkb.S b/arch/riscv/lib/crypto/chacha-riscv64-zvkb.S deleted file mode 100644 index b777d0b4e379..000000000000 --- a/arch/riscv/lib/crypto/chacha-riscv64-zvkb.S +++ /dev/null @@ -1,297 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ -// -// This file is dual-licensed, meaning that you can use it under your -// choice of either of the following two licenses: -// -// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. -// -// Licensed under the Apache License 2.0 (the "License"). You can obtain -// a copy in the file LICENSE in the source distribution or at -// https://www.openssl.org/source/license.html -// -// or -// -// Copyright (c) 2023, Jerry Shih -// Copyright 2024 Google LLC -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// The generated code of this file depends on the following RISC-V extensions: -// - RV64I -// - RISC-V Vector ('V') with VLEN >= 128 -// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') - -#include - -.text -.option arch, +zvkb - -#define STATEP a0 -#define INP a1 -#define OUTP a2 -#define NBLOCKS a3 -#define NROUNDS a4 - -#define CONSTS0 a5 -#define CONSTS1 a6 -#define CONSTS2 a7 -#define CONSTS3 t0 -#define TMP t1 -#define VL t2 -#define STRIDE t3 -#define ROUND_CTR t4 -#define KEY0 s0 -#define KEY1 s1 -#define KEY2 s2 -#define KEY3 s3 -#define KEY4 s4 -#define KEY5 s5 -#define KEY6 s6 -#define KEY7 s7 -#define COUNTER s8 -#define NONCE0 s9 -#define NONCE1 s10 -#define NONCE2 s11 - -.macro chacha_round a0, b0, c0, d0, a1, b1, c1, d1, \ - a2, b2, c2, d2, a3, b3, c3, d3 - // a += b; d ^= a; d = rol(d, 16); - vadd.vv \a0, \a0, \b0 - vadd.vv \a1, \a1, \b1 - vadd.vv \a2, \a2, \b2 - vadd.vv \a3, \a3, \b3 - vxor.vv \d0, \d0, \a0 - vxor.vv \d1, \d1, \a1 - vxor.vv \d2, \d2, \a2 - vxor.vv \d3, \d3, \a3 - vror.vi \d0, \d0, 32 - 16 - vror.vi \d1, \d1, 32 - 16 - vror.vi \d2, \d2, 32 - 16 - vror.vi \d3, \d3, 32 - 16 - - // c += d; b ^= c; b = rol(b, 12); - vadd.vv \c0, \c0, \d0 - vadd.vv \c1, \c1, \d1 - vadd.vv \c2, \c2, \d2 - vadd.vv \c3, \c3, \d3 - vxor.vv \b0, \b0, \c0 - vxor.vv \b1, \b1, \c1 - vxor.vv \b2, \b2, \c2 - vxor.vv \b3, \b3, \c3 - vror.vi \b0, \b0, 32 - 12 - vror.vi \b1, \b1, 32 - 12 - vror.vi \b2, \b2, 32 - 12 - vror.vi \b3, \b3, 32 - 12 - - // a += b; d ^= a; d = rol(d, 8); - vadd.vv \a0, \a0, \b0 - vadd.vv \a1, \a1, \b1 - vadd.vv \a2, \a2, \b2 - vadd.vv \a3, \a3, \b3 - vxor.vv \d0, \d0, \a0 - vxor.vv \d1, \d1, \a1 - vxor.vv \d2, \d2, \a2 - vxor.vv \d3, \d3, \a3 - vror.vi \d0, \d0, 32 - 8 - vror.vi \d1, \d1, 32 - 8 - vror.vi \d2, \d2, 32 - 8 - vror.vi \d3, \d3, 32 - 8 - - // c += d; b ^= c; b = rol(b, 7); - vadd.vv \c0, \c0, \d0 - vadd.vv \c1, \c1, \d1 - vadd.vv \c2, \c2, \d2 - vadd.vv \c3, \c3, \d3 - vxor.vv \b0, \b0, \c0 - vxor.vv \b1, \b1, \c1 - vxor.vv \b2, \b2, \c2 - vxor.vv \b3, \b3, \c3 - vror.vi \b0, \b0, 32 - 7 - vror.vi \b1, \b1, 32 - 7 - vror.vi \b2, \b2, 32 - 7 - vror.vi \b3, \b3, 32 - 7 -.endm - -// void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out, -// size_t nblocks, int nrounds); -// -// |nblocks| is the number of 64-byte blocks to process, and must be nonzero. -// -// |state| gives the ChaCha state matrix, including the 32-bit counter in -// state->x[12] following the RFC7539 convention; note that this differs from -// the original Salsa20 paper which uses a 64-bit counter in state->x[12..13]. -// The updated 32-bit counter is written back to state->x[12] before returning. -SYM_FUNC_START(chacha_zvkb) - addi sp, sp, -96 - sd s0, 0(sp) - sd s1, 8(sp) - sd s2, 16(sp) - sd s3, 24(sp) - sd s4, 32(sp) - sd s5, 40(sp) - sd s6, 48(sp) - sd s7, 56(sp) - sd s8, 64(sp) - sd s9, 72(sp) - sd s10, 80(sp) - sd s11, 88(sp) - - li STRIDE, 64 - - // Set up the initial state matrix in scalar registers. 
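
The loads that follow read the state matrix one word at a time; struct chacha_state in the kernel is simply the 16-word matrix (the comment below refers to the counter as state->x[12]). As a sketch of the layout these offsets correspond to, with an illustrative type name since the real type is just an array of sixteen 32-bit words:

#include <stdint.h>

/* View of the 64-byte ChaCha state matrix as loaded below. */
struct chacha_state_view {
        uint32_t constants[4];  /* bytes  0..15, "expand 32-byte k" */
        uint32_t key[8];        /* bytes 16..47                     */
        uint32_t counter;       /* bytes 48..51, x[12], RFC 7539    */
        uint32_t nonce[3];      /* bytes 52..63                     */
};
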
- lw CONSTS0, 0(STATEP) - lw CONSTS1, 4(STATEP) - lw CONSTS2, 8(STATEP) - lw CONSTS3, 12(STATEP) - lw KEY0, 16(STATEP) - lw KEY1, 20(STATEP) - lw KEY2, 24(STATEP) - lw KEY3, 28(STATEP) - lw KEY4, 32(STATEP) - lw KEY5, 36(STATEP) - lw KEY6, 40(STATEP) - lw KEY7, 44(STATEP) - lw COUNTER, 48(STATEP) - lw NONCE0, 52(STATEP) - lw NONCE1, 56(STATEP) - lw NONCE2, 60(STATEP) - -.Lblock_loop: - // Set vl to the number of blocks to process in this iteration. - vsetvli VL, NBLOCKS, e32, m1, ta, ma - - // Set up the initial state matrix for the next VL blocks in v0-v15. - // v{i} holds the i'th 32-bit word of the state matrix for all blocks. - // Note that only the counter word, at index 12, differs across blocks. - vmv.v.x v0, CONSTS0 - vmv.v.x v1, CONSTS1 - vmv.v.x v2, CONSTS2 - vmv.v.x v3, CONSTS3 - vmv.v.x v4, KEY0 - vmv.v.x v5, KEY1 - vmv.v.x v6, KEY2 - vmv.v.x v7, KEY3 - vmv.v.x v8, KEY4 - vmv.v.x v9, KEY5 - vmv.v.x v10, KEY6 - vmv.v.x v11, KEY7 - vid.v v12 - vadd.vx v12, v12, COUNTER - vmv.v.x v13, NONCE0 - vmv.v.x v14, NONCE1 - vmv.v.x v15, NONCE2 - - // Load the first half of the input data for each block into v16-v23. - // v{16+i} holds the i'th 32-bit word for all blocks. - vlsseg8e32.v v16, (INP), STRIDE - - mv ROUND_CTR, NROUNDS -.Lnext_doubleround: - addi ROUND_CTR, ROUND_CTR, -2 - // column round - chacha_round v0, v4, v8, v12, v1, v5, v9, v13, \ - v2, v6, v10, v14, v3, v7, v11, v15 - // diagonal round - chacha_round v0, v5, v10, v15, v1, v6, v11, v12, \ - v2, v7, v8, v13, v3, v4, v9, v14 - bnez ROUND_CTR, .Lnext_doubleround - - // Load the second half of the input data for each block into v24-v31. - // v{24+i} holds the {8+i}'th 32-bit word for all blocks. - addi TMP, INP, 32 - vlsseg8e32.v v24, (TMP), STRIDE - - // Finalize the first half of the keystream for each block. - vadd.vx v0, v0, CONSTS0 - vadd.vx v1, v1, CONSTS1 - vadd.vx v2, v2, CONSTS2 - vadd.vx v3, v3, CONSTS3 - vadd.vx v4, v4, KEY0 - vadd.vx v5, v5, KEY1 - vadd.vx v6, v6, KEY2 - vadd.vx v7, v7, KEY3 - - // Encrypt/decrypt the first half of the data for each block. - vxor.vv v16, v16, v0 - vxor.vv v17, v17, v1 - vxor.vv v18, v18, v2 - vxor.vv v19, v19, v3 - vxor.vv v20, v20, v4 - vxor.vv v21, v21, v5 - vxor.vv v22, v22, v6 - vxor.vv v23, v23, v7 - - // Store the first half of the output data for each block. - vssseg8e32.v v16, (OUTP), STRIDE - - // Finalize the second half of the keystream for each block. - vadd.vx v8, v8, KEY4 - vadd.vx v9, v9, KEY5 - vadd.vx v10, v10, KEY6 - vadd.vx v11, v11, KEY7 - vid.v v0 - vadd.vx v12, v12, COUNTER - vadd.vx v13, v13, NONCE0 - vadd.vx v14, v14, NONCE1 - vadd.vx v15, v15, NONCE2 - vadd.vv v12, v12, v0 - - // Encrypt/decrypt the second half of the data for each block. - vxor.vv v24, v24, v8 - vxor.vv v25, v25, v9 - vxor.vv v26, v26, v10 - vxor.vv v27, v27, v11 - vxor.vv v29, v29, v13 - vxor.vv v28, v28, v12 - vxor.vv v30, v30, v14 - vxor.vv v31, v31, v15 - - // Store the second half of the output data for each block. - addi TMP, OUTP, 32 - vssseg8e32.v v24, (TMP), STRIDE - - // Update the counter, the remaining number of blocks, and the input and - // output pointers according to the number of blocks processed (VL). 
- add COUNTER, COUNTER, VL - sub NBLOCKS, NBLOCKS, VL - slli TMP, VL, 6 - add OUTP, OUTP, TMP - add INP, INP, TMP - bnez NBLOCKS, .Lblock_loop - - sw COUNTER, 48(STATEP) - ld s0, 0(sp) - ld s1, 8(sp) - ld s2, 16(sp) - ld s3, 24(sp) - ld s4, 32(sp) - ld s5, 40(sp) - ld s6, 48(sp) - ld s7, 56(sp) - ld s8, 64(sp) - ld s9, 72(sp) - ld s10, 80(sp) - ld s11, 88(sp) - addi sp, sp, 96 - ret -SYM_FUNC_END(chacha_zvkb) diff --git a/arch/riscv/lib/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/arch/riscv/lib/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S deleted file mode 100644 index fad501ad0617..000000000000 --- a/arch/riscv/lib/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S +++ /dev/null @@ -1,225 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ -// -// This file is dual-licensed, meaning that you can use it under your -// choice of either of the following two licenses: -// -// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. -// -// Licensed under the Apache License 2.0 (the "License"). You can obtain -// a copy in the file LICENSE in the source distribution or at -// https://www.openssl.org/source/license.html -// -// or -// -// Copyright (c) 2023, Christoph Müllner -// Copyright (c) 2023, Phoebe Chen -// Copyright 2024 Google LLC -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// The generated code of this file depends on the following RISC-V extensions: -// - RV64I -// - RISC-V Vector ('V') with VLEN >= 128 -// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb') -// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') - -#include - -.text -.option arch, +zvknha, +zvkb - -#define STATEP a0 -#define DATA a1 -#define NUM_BLOCKS a2 - -#define STATEP_C a3 - -#define MASK v0 -#define INDICES v1 -#define W0 v2 -#define W1 v3 -#define W2 v4 -#define W3 v5 -#define VTMP v6 -#define FEBA v7 -#define HGDC v8 -#define K0 v10 -#define K1 v11 -#define K2 v12 -#define K3 v13 -#define K4 v14 -#define K5 v15 -#define K6 v16 -#define K7 v17 -#define K8 v18 -#define K9 v19 -#define K10 v20 -#define K11 v21 -#define K12 v22 -#define K13 v23 -#define K14 v24 -#define K15 v25 -#define PREV_FEBA v26 -#define PREV_HGDC v27 - -// Do 4 rounds of SHA-256. 
w0 contains the current 4 message schedule words. -// -// If not all the message schedule words have been computed yet, then this also -// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4 -// message schedule words; this macro computes the group after w3 and writes it -// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3, -// w0), so the caller must cycle through the registers accordingly. -.macro sha256_4rounds last, k, w0, w1, w2, w3 - vadd.vv VTMP, \k, \w0 - vsha2cl.vv HGDC, FEBA, VTMP - vsha2ch.vv FEBA, HGDC, VTMP -.if !\last - vmerge.vvm VTMP, \w2, \w1, MASK - vsha2ms.vv \w0, VTMP, \w3 -.endif -.endm - -.macro sha256_16rounds last, k0, k1, k2, k3 - sha256_4rounds \last, \k0, W0, W1, W2, W3 - sha256_4rounds \last, \k1, W1, W2, W3, W0 - sha256_4rounds \last, \k2, W2, W3, W0, W1 - sha256_4rounds \last, \k3, W3, W0, W1, W2 -.endm - -// void sha256_transform_zvknha_or_zvknhb_zvkb(u32 state[SHA256_STATE_WORDS], -// const u8 *data, size_t nblocks); -SYM_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb) - - // Load the round constants into K0-K15. - vsetivli zero, 4, e32, m1, ta, ma - la t0, K256 - vle32.v K0, (t0) - addi t0, t0, 16 - vle32.v K1, (t0) - addi t0, t0, 16 - vle32.v K2, (t0) - addi t0, t0, 16 - vle32.v K3, (t0) - addi t0, t0, 16 - vle32.v K4, (t0) - addi t0, t0, 16 - vle32.v K5, (t0) - addi t0, t0, 16 - vle32.v K6, (t0) - addi t0, t0, 16 - vle32.v K7, (t0) - addi t0, t0, 16 - vle32.v K8, (t0) - addi t0, t0, 16 - vle32.v K9, (t0) - addi t0, t0, 16 - vle32.v K10, (t0) - addi t0, t0, 16 - vle32.v K11, (t0) - addi t0, t0, 16 - vle32.v K12, (t0) - addi t0, t0, 16 - vle32.v K13, (t0) - addi t0, t0, 16 - vle32.v K14, (t0) - addi t0, t0, 16 - vle32.v K15, (t0) - - // Setup mask for the vmerge to replace the first word (idx==0) in - // message scheduling. There are 4 words, so an 8-bit mask suffices. - vsetivli zero, 1, e8, m1, ta, ma - vmv.v.i MASK, 0x01 - - // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we - // need {f,e,b,a},{h,g,d,c}. The dst vtype is e32m1 and the index vtype - // is e8mf4. We use index-load with the i8 indices {20, 16, 4, 0}, - // loaded using the 32-bit little endian value 0x00041014. - li t0, 0x00041014 - vsetivli zero, 1, e32, m1, ta, ma - vmv.v.x INDICES, t0 - addi STATEP_C, STATEP, 8 - vsetivli zero, 4, e32, m1, ta, ma - vluxei8.v FEBA, (STATEP), INDICES - vluxei8.v HGDC, (STATEP_C), INDICES - -.Lnext_block: - addi NUM_BLOCKS, NUM_BLOCKS, -1 - - // Save the previous state, as it's needed later. - vmv.v.v PREV_FEBA, FEBA - vmv.v.v PREV_HGDC, HGDC - - // Load the next 512-bit message block and endian-swap each 32-bit word. - vle32.v W0, (DATA) - vrev8.v W0, W0 - addi DATA, DATA, 16 - vle32.v W1, (DATA) - vrev8.v W1, W1 - addi DATA, DATA, 16 - vle32.v W2, (DATA) - vrev8.v W2, W2 - addi DATA, DATA, 16 - vle32.v W3, (DATA) - vrev8.v W3, W3 - addi DATA, DATA, 16 - - // Do the 64 rounds of SHA-256. - sha256_16rounds 0, K0, K1, K2, K3 - sha256_16rounds 0, K4, K5, K6, K7 - sha256_16rounds 0, K8, K9, K10, K11 - sha256_16rounds 1, K12, K13, K14, K15 - - // Add the previous state. - vadd.vv FEBA, FEBA, PREV_FEBA - vadd.vv HGDC, HGDC, PREV_HGDC - - // Repeat if more blocks remain. - bnez NUM_BLOCKS, .Lnext_block - - // Store the new state and return. 
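
The W0-W3 register cycling in the sha256_4rounds/sha256_16rounds macros above tracks the standard SHA-256 message schedule: every group except the last produces the next four W words from the usual recurrence over W[i-2], W[i-7], W[i-15] and W[i-16]. A scalar sketch of that recurrence (FIPS 180-4; helper names are illustrative):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
        return (x >> n) | (x << (32 - n));
}

/* Expand the 16 message words of one block to the full 64-entry schedule. */
static void sha256_schedule(uint32_t w[64])
{
        int i;

        for (i = 16; i < 64; i++) {
                uint32_t s0 = ror32(w[i - 15], 7) ^ ror32(w[i - 15], 18) ^
                              (w[i - 15] >> 3);
                uint32_t s1 = ror32(w[i - 2], 17) ^ ror32(w[i - 2], 19) ^
                              (w[i - 2] >> 10);

                w[i] = w[i - 16] + s0 + w[i - 7] + s1;
        }
}
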
- vsuxei8.v FEBA, (STATEP), INDICES - vsuxei8.v HGDC, (STATEP_C), INDICES - ret -SYM_FUNC_END(sha256_transform_zvknha_or_zvknhb_zvkb) - -.section ".rodata" -.p2align 2 -.type K256, @object -K256: - .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 - .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 - .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 - .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 - .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc - .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da - .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 - .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 - .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 - .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 - .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 - .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 - .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 - .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 - .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 - .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -.size K256, . - K256 diff --git a/arch/riscv/lib/crypto/sha256.c b/arch/riscv/lib/crypto/sha256.c deleted file mode 100644 index 71808397dff4..000000000000 --- a/arch/riscv/lib/crypto/sha256.c +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 (RISC-V accelerated) - * - * Copyright (C) 2022 VRULL GmbH - * Author: Heiko Stuebner - * - * Copyright (C) 2023 SiFive, Inc. - * Author: Jerry Shih - */ - -#include -#include -#include -#include - -asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb( - u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); - -void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - if (static_branch_likely(&have_extensions)) { - kernel_vector_begin(); - sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks); - kernel_vector_end(); - } else { - sha256_blocks_generic(state, data, nblocks); - } -} -EXPORT_SYMBOL_GPL(sha256_blocks_simd); - -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - sha256_blocks_generic(state, data, nblocks); -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -bool sha256_is_arch_optimized(void) -{ - return static_key_enabled(&have_extensions); -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - -static int __init riscv64_sha256_mod_init(void) -{ - /* Both zvknha and zvknhb provide the SHA-256 instructions. 
*/ - if ((riscv_isa_extension_available(NULL, ZVKNHA) || - riscv_isa_extension_available(NULL, ZVKNHB)) && - riscv_isa_extension_available(NULL, ZVKB) && - riscv_vector_vlen() >= 128) - static_branch_enable(&have_extensions); - return 0; -} -subsys_initcall(riscv64_sha256_mod_init); - -static void __exit riscv64_sha256_mod_exit(void) -{ -} -module_exit(riscv64_sha256_mod_exit); - -MODULE_DESCRIPTION("SHA-256 (RISC-V accelerated)"); -MODULE_AUTHOR("Heiko Stuebner "); -MODULE_LICENSE("GPL"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index f4f9a70dd089..a2b58ca2df0c 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -202,7 +202,7 @@ if PPC source "lib/crypto/powerpc/Kconfig" endif if RISCV -source "arch/riscv/lib/crypto/Kconfig" +source "lib/crypto/riscv/Kconfig" endif if S390 source "arch/s390/lib/crypto/Kconfig" diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index f5f1dcec2f89..05b7e29ea0e8 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -111,3 +111,4 @@ obj-$(CONFIG_ARM) += arm/ obj-$(CONFIG_ARM64) += arm64/ obj-$(CONFIG_MIPS) += mips/ obj-$(CONFIG_PPC) += powerpc/ +obj-$(CONFIG_RISCV) += riscv/ diff --git a/lib/crypto/riscv/Kconfig b/lib/crypto/riscv/Kconfig new file mode 100644 index 000000000000..47c99ea97ce2 --- /dev/null +++ b/lib/crypto/riscv/Kconfig @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA_RISCV64 + tristate + depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default CRYPTO_LIB_CHACHA + select CRYPTO_ARCH_HAVE_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + +config CRYPTO_SHA256_RISCV64 + tristate + depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default CRYPTO_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD + select CRYPTO_LIB_SHA256_GENERIC diff --git a/lib/crypto/riscv/Makefile b/lib/crypto/riscv/Makefile new file mode 100644 index 000000000000..b7cb877a2c07 --- /dev/null +++ b/lib/crypto/riscv/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o +chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o + +obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o +sha256-riscv64-y := sha256.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o diff --git a/lib/crypto/riscv/chacha-riscv64-glue.c b/lib/crypto/riscv/chacha-riscv64-glue.c new file mode 100644 index 000000000000..8c3f11d79be3 --- /dev/null +++ b/lib/crypto/riscv/chacha-riscv64-glue.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ChaCha stream cipher (RISC-V optimized) + * + * Copyright (C) 2023 SiFive, Inc. 
+ * Author: Jerry Shih + */ + +#include +#include +#include +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb); + +asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out, + size_t nblocks, int nrounds); + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + hchacha_block_generic(state, out, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + u8 block_buffer[CHACHA_BLOCK_SIZE]; + unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE; + unsigned int tail_bytes = bytes % CHACHA_BLOCK_SIZE; + + if (!static_branch_likely(&use_zvkb) || !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + kernel_vector_begin(); + if (full_blocks) { + chacha_zvkb(state, src, dst, full_blocks, nrounds); + src += full_blocks * CHACHA_BLOCK_SIZE; + dst += full_blocks * CHACHA_BLOCK_SIZE; + } + if (tail_bytes) { + memcpy(block_buffer, src, tail_bytes); + chacha_zvkb(state, block_buffer, block_buffer, 1, nrounds); + memcpy(dst, block_buffer, tail_bytes); + } + kernel_vector_end(); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&use_zvkb); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init riscv64_chacha_mod_init(void) +{ + if (riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + static_branch_enable(&use_zvkb); + return 0; +} +subsys_initcall(riscv64_chacha_mod_init); + +static void __exit riscv64_chacha_mod_exit(void) +{ +} +module_exit(riscv64_chacha_mod_exit); + +MODULE_DESCRIPTION("ChaCha stream cipher (RISC-V optimized)"); +MODULE_AUTHOR("Jerry Shih "); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/riscv/chacha-riscv64-zvkb.S b/lib/crypto/riscv/chacha-riscv64-zvkb.S new file mode 100644 index 000000000000..b777d0b4e379 --- /dev/null +++ b/lib/crypto/riscv/chacha-riscv64-zvkb.S @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Jerry Shih +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include + +.text +.option arch, +zvkb + +#define STATEP a0 +#define INP a1 +#define OUTP a2 +#define NBLOCKS a3 +#define NROUNDS a4 + +#define CONSTS0 a5 +#define CONSTS1 a6 +#define CONSTS2 a7 +#define CONSTS3 t0 +#define TMP t1 +#define VL t2 +#define STRIDE t3 +#define ROUND_CTR t4 +#define KEY0 s0 +#define KEY1 s1 +#define KEY2 s2 +#define KEY3 s3 +#define KEY4 s4 +#define KEY5 s5 +#define KEY6 s6 +#define KEY7 s7 +#define COUNTER s8 +#define NONCE0 s9 +#define NONCE1 s10 +#define NONCE2 s11 + +.macro chacha_round a0, b0, c0, d0, a1, b1, c1, d1, \ + a2, b2, c2, d2, a3, b3, c3, d3 + // a += b; d ^= a; d = rol(d, 16); + vadd.vv \a0, \a0, \b0 + vadd.vv \a1, \a1, \b1 + vadd.vv \a2, \a2, \b2 + vadd.vv \a3, \a3, \b3 + vxor.vv \d0, \d0, \a0 + vxor.vv \d1, \d1, \a1 + vxor.vv \d2, \d2, \a2 + vxor.vv \d3, \d3, \a3 + vror.vi \d0, \d0, 32 - 16 + vror.vi \d1, \d1, 32 - 16 + vror.vi \d2, \d2, 32 - 16 + vror.vi \d3, \d3, 32 - 16 + + // c += d; b ^= c; b = rol(b, 12); + vadd.vv \c0, \c0, \d0 + vadd.vv \c1, \c1, \d1 + vadd.vv \c2, \c2, \d2 + vadd.vv \c3, \c3, \d3 + vxor.vv \b0, \b0, \c0 + vxor.vv \b1, \b1, \c1 + vxor.vv \b2, \b2, \c2 + vxor.vv \b3, \b3, \c3 + vror.vi \b0, \b0, 32 - 12 + vror.vi \b1, \b1, 32 - 12 + vror.vi \b2, \b2, 32 - 12 + vror.vi \b3, \b3, 32 - 12 + + // a += b; d ^= a; d = rol(d, 8); + vadd.vv \a0, \a0, \b0 + vadd.vv \a1, \a1, \b1 + vadd.vv \a2, \a2, \b2 + vadd.vv \a3, \a3, \b3 + vxor.vv \d0, \d0, \a0 + vxor.vv \d1, \d1, \a1 + vxor.vv \d2, \d2, \a2 + vxor.vv \d3, \d3, \a3 + vror.vi \d0, \d0, 32 - 8 + vror.vi \d1, \d1, 32 - 8 + vror.vi \d2, \d2, 32 - 8 + vror.vi \d3, \d3, 32 - 8 + + // c += d; b ^= c; b = rol(b, 7); + vadd.vv \c0, \c0, \d0 + vadd.vv \c1, \c1, \d1 + vadd.vv \c2, \c2, \d2 + vadd.vv \c3, \c3, \d3 + vxor.vv \b0, \b0, \c0 + vxor.vv \b1, \b1, \c1 + vxor.vv \b2, \b2, \c2 + vxor.vv \b3, \b3, \c3 + vror.vi \b0, \b0, 32 - 7 + vror.vi \b1, \b1, 32 - 7 + vror.vi \b2, \b2, 32 - 7 + vror.vi \b3, \b3, 32 - 7 +.endm + +// void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out, +// size_t nblocks, int nrounds); +// +// |nblocks| is the number of 64-byte blocks to process, and must be nonzero. +// +// |state| gives the ChaCha state matrix, including the 32-bit counter in +// state->x[12] following the RFC7539 convention; note that this differs from +// the original Salsa20 paper which uses a 64-bit counter in state->x[12..13]. +// The updated 32-bit counter is written back to state->x[12] before returning. +SYM_FUNC_START(chacha_zvkb) + addi sp, sp, -96 + sd s0, 0(sp) + sd s1, 8(sp) + sd s2, 16(sp) + sd s3, 24(sp) + sd s4, 32(sp) + sd s5, 40(sp) + sd s6, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + sd s9, 72(sp) + sd s10, 80(sp) + sd s11, 88(sp) + + li STRIDE, 64 + + // Set up the initial state matrix in scalar registers. 
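+	// The state follows the RFC 7539 layout: words 0-3 hold the ChaCha
+	// constants, words 4-11 the 256-bit key, word 12 the 32-bit block
+	// counter, and words 13-15 the nonce, at the byte offsets read below.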
+ lw CONSTS0, 0(STATEP) + lw CONSTS1, 4(STATEP) + lw CONSTS2, 8(STATEP) + lw CONSTS3, 12(STATEP) + lw KEY0, 16(STATEP) + lw KEY1, 20(STATEP) + lw KEY2, 24(STATEP) + lw KEY3, 28(STATEP) + lw KEY4, 32(STATEP) + lw KEY5, 36(STATEP) + lw KEY6, 40(STATEP) + lw KEY7, 44(STATEP) + lw COUNTER, 48(STATEP) + lw NONCE0, 52(STATEP) + lw NONCE1, 56(STATEP) + lw NONCE2, 60(STATEP) + +.Lblock_loop: + // Set vl to the number of blocks to process in this iteration. + vsetvli VL, NBLOCKS, e32, m1, ta, ma + + // Set up the initial state matrix for the next VL blocks in v0-v15. + // v{i} holds the i'th 32-bit word of the state matrix for all blocks. + // Note that only the counter word, at index 12, differs across blocks. + vmv.v.x v0, CONSTS0 + vmv.v.x v1, CONSTS1 + vmv.v.x v2, CONSTS2 + vmv.v.x v3, CONSTS3 + vmv.v.x v4, KEY0 + vmv.v.x v5, KEY1 + vmv.v.x v6, KEY2 + vmv.v.x v7, KEY3 + vmv.v.x v8, KEY4 + vmv.v.x v9, KEY5 + vmv.v.x v10, KEY6 + vmv.v.x v11, KEY7 + vid.v v12 + vadd.vx v12, v12, COUNTER + vmv.v.x v13, NONCE0 + vmv.v.x v14, NONCE1 + vmv.v.x v15, NONCE2 + + // Load the first half of the input data for each block into v16-v23. + // v{16+i} holds the i'th 32-bit word for all blocks. + vlsseg8e32.v v16, (INP), STRIDE + + mv ROUND_CTR, NROUNDS +.Lnext_doubleround: + addi ROUND_CTR, ROUND_CTR, -2 + // column round + chacha_round v0, v4, v8, v12, v1, v5, v9, v13, \ + v2, v6, v10, v14, v3, v7, v11, v15 + // diagonal round + chacha_round v0, v5, v10, v15, v1, v6, v11, v12, \ + v2, v7, v8, v13, v3, v4, v9, v14 + bnez ROUND_CTR, .Lnext_doubleround + + // Load the second half of the input data for each block into v24-v31. + // v{24+i} holds the {8+i}'th 32-bit word for all blocks. + addi TMP, INP, 32 + vlsseg8e32.v v24, (TMP), STRIDE + + // Finalize the first half of the keystream for each block. + vadd.vx v0, v0, CONSTS0 + vadd.vx v1, v1, CONSTS1 + vadd.vx v2, v2, CONSTS2 + vadd.vx v3, v3, CONSTS3 + vadd.vx v4, v4, KEY0 + vadd.vx v5, v5, KEY1 + vadd.vx v6, v6, KEY2 + vadd.vx v7, v7, KEY3 + + // Encrypt/decrypt the first half of the data for each block. + vxor.vv v16, v16, v0 + vxor.vv v17, v17, v1 + vxor.vv v18, v18, v2 + vxor.vv v19, v19, v3 + vxor.vv v20, v20, v4 + vxor.vv v21, v21, v5 + vxor.vv v22, v22, v6 + vxor.vv v23, v23, v7 + + // Store the first half of the output data for each block. + vssseg8e32.v v16, (OUTP), STRIDE + + // Finalize the second half of the keystream for each block. + vadd.vx v8, v8, KEY4 + vadd.vx v9, v9, KEY5 + vadd.vx v10, v10, KEY6 + vadd.vx v11, v11, KEY7 + vid.v v0 + vadd.vx v12, v12, COUNTER + vadd.vx v13, v13, NONCE0 + vadd.vx v14, v14, NONCE1 + vadd.vx v15, v15, NONCE2 + vadd.vv v12, v12, v0 + + // Encrypt/decrypt the second half of the data for each block. + vxor.vv v24, v24, v8 + vxor.vv v25, v25, v9 + vxor.vv v26, v26, v10 + vxor.vv v27, v27, v11 + vxor.vv v29, v29, v13 + vxor.vv v28, v28, v12 + vxor.vv v30, v30, v14 + vxor.vv v31, v31, v15 + + // Store the second half of the output data for each block. + addi TMP, OUTP, 32 + vssseg8e32.v v24, (TMP), STRIDE + + // Update the counter, the remaining number of blocks, and the input and + // output pointers according to the number of blocks processed (VL). 
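+	// Each block is 64 bytes, so the data pointers advance by VL << 6 bytes.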
+ add COUNTER, COUNTER, VL + sub NBLOCKS, NBLOCKS, VL + slli TMP, VL, 6 + add OUTP, OUTP, TMP + add INP, INP, TMP + bnez NBLOCKS, .Lblock_loop + + sw COUNTER, 48(STATEP) + ld s0, 0(sp) + ld s1, 8(sp) + ld s2, 16(sp) + ld s3, 24(sp) + ld s4, 32(sp) + ld s5, 40(sp) + ld s6, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + ld s9, 72(sp) + ld s10, 80(sp) + ld s11, 88(sp) + addi sp, sp, 96 + ret +SYM_FUNC_END(chacha_zvkb) diff --git a/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S new file mode 100644 index 000000000000..fad501ad0617 --- /dev/null +++ b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner +// Copyright (c) 2023, Phoebe Chen +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb') +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include + +.text +.option arch, +zvknha, +zvkb + +#define STATEP a0 +#define DATA a1 +#define NUM_BLOCKS a2 + +#define STATEP_C a3 + +#define MASK v0 +#define INDICES v1 +#define W0 v2 +#define W1 v3 +#define W2 v4 +#define W3 v5 +#define VTMP v6 +#define FEBA v7 +#define HGDC v8 +#define K0 v10 +#define K1 v11 +#define K2 v12 +#define K3 v13 +#define K4 v14 +#define K5 v15 +#define K6 v16 +#define K7 v17 +#define K8 v18 +#define K9 v19 +#define K10 v20 +#define K11 v21 +#define K12 v22 +#define K13 v23 +#define K14 v24 +#define K15 v25 +#define PREV_FEBA v26 +#define PREV_HGDC v27 + +// Do 4 rounds of SHA-256. w0 contains the current 4 message schedule words. 
+// +// If not all the message schedule words have been computed yet, then this also +// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4 +// message schedule words; this macro computes the group after w3 and writes it +// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3, +// w0), so the caller must cycle through the registers accordingly. +.macro sha256_4rounds last, k, w0, w1, w2, w3 + vadd.vv VTMP, \k, \w0 + vsha2cl.vv HGDC, FEBA, VTMP + vsha2ch.vv FEBA, HGDC, VTMP +.if !\last + vmerge.vvm VTMP, \w2, \w1, MASK + vsha2ms.vv \w0, VTMP, \w3 +.endif +.endm + +.macro sha256_16rounds last, k0, k1, k2, k3 + sha256_4rounds \last, \k0, W0, W1, W2, W3 + sha256_4rounds \last, \k1, W1, W2, W3, W0 + sha256_4rounds \last, \k2, W2, W3, W0, W1 + sha256_4rounds \last, \k3, W3, W0, W1, W2 +.endm + +// void sha256_transform_zvknha_or_zvknhb_zvkb(u32 state[SHA256_STATE_WORDS], +// const u8 *data, size_t nblocks); +SYM_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb) + + // Load the round constants into K0-K15. + vsetivli zero, 4, e32, m1, ta, ma + la t0, K256 + vle32.v K0, (t0) + addi t0, t0, 16 + vle32.v K1, (t0) + addi t0, t0, 16 + vle32.v K2, (t0) + addi t0, t0, 16 + vle32.v K3, (t0) + addi t0, t0, 16 + vle32.v K4, (t0) + addi t0, t0, 16 + vle32.v K5, (t0) + addi t0, t0, 16 + vle32.v K6, (t0) + addi t0, t0, 16 + vle32.v K7, (t0) + addi t0, t0, 16 + vle32.v K8, (t0) + addi t0, t0, 16 + vle32.v K9, (t0) + addi t0, t0, 16 + vle32.v K10, (t0) + addi t0, t0, 16 + vle32.v K11, (t0) + addi t0, t0, 16 + vle32.v K12, (t0) + addi t0, t0, 16 + vle32.v K13, (t0) + addi t0, t0, 16 + vle32.v K14, (t0) + addi t0, t0, 16 + vle32.v K15, (t0) + + // Setup mask for the vmerge to replace the first word (idx==0) in + // message scheduling. There are 4 words, so an 8-bit mask suffices. + vsetivli zero, 1, e8, m1, ta, ma + vmv.v.i MASK, 0x01 + + // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we + // need {f,e,b,a},{h,g,d,c}. The dst vtype is e32m1 and the index vtype + // is e8mf4. We use index-load with the i8 indices {20, 16, 4, 0}, + // loaded using the 32-bit little endian value 0x00041014. + li t0, 0x00041014 + vsetivli zero, 1, e32, m1, ta, ma + vmv.v.x INDICES, t0 + addi STATEP_C, STATEP, 8 + vsetivli zero, 4, e32, m1, ta, ma + vluxei8.v FEBA, (STATEP), INDICES + vluxei8.v HGDC, (STATEP_C), INDICES + +.Lnext_block: + addi NUM_BLOCKS, NUM_BLOCKS, -1 + + // Save the previous state, as it's needed later. + vmv.v.v PREV_FEBA, FEBA + vmv.v.v PREV_HGDC, HGDC + + // Load the next 512-bit message block and endian-swap each 32-bit word. + vle32.v W0, (DATA) + vrev8.v W0, W0 + addi DATA, DATA, 16 + vle32.v W1, (DATA) + vrev8.v W1, W1 + addi DATA, DATA, 16 + vle32.v W2, (DATA) + vrev8.v W2, W2 + addi DATA, DATA, 16 + vle32.v W3, (DATA) + vrev8.v W3, W3 + addi DATA, DATA, 16 + + // Do the 64 rounds of SHA-256. + sha256_16rounds 0, K0, K1, K2, K3 + sha256_16rounds 0, K4, K5, K6, K7 + sha256_16rounds 0, K8, K9, K10, K11 + sha256_16rounds 1, K12, K13, K14, K15 + + // Add the previous state. + vadd.vv FEBA, FEBA, PREV_FEBA + vadd.vv HGDC, HGDC, PREV_HGDC + + // Repeat if more blocks remain. + bnez NUM_BLOCKS, .Lnext_block + + // Store the new state and return. 
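+	// The stores reuse INDICES: its little-endian bytes {0x14, 0x10, 0x04,
+	// 0x00} are the byte offsets {20, 16, 4, 0}, so {f,e,b,a} and {h,g,d,c}
+	// scatter back to their natural a..h positions in the state array.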
+ vsuxei8.v FEBA, (STATEP), INDICES + vsuxei8.v HGDC, (STATEP_C), INDICES + ret +SYM_FUNC_END(sha256_transform_zvknha_or_zvknhb_zvkb) + +.section ".rodata" +.p2align 2 +.type K256, @object +K256: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +.size K256, . - K256 diff --git a/lib/crypto/riscv/sha256.c b/lib/crypto/riscv/sha256.c new file mode 100644 index 000000000000..71808397dff4 --- /dev/null +++ b/lib/crypto/riscv/sha256.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-256 (RISC-V accelerated) + * + * Copyright (C) 2022 VRULL GmbH + * Author: Heiko Stuebner + * + * Copyright (C) 2023 SiFive, Inc. + * Author: Jerry Shih + */ + +#include +#include +#include +#include + +asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb( + u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); + +void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_extensions)) { + kernel_vector_begin(); + sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks); + kernel_vector_end(); + } else { + sha256_blocks_generic(state, data, nblocks); + } +} +EXPORT_SYMBOL_GPL(sha256_blocks_simd); + +void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + sha256_blocks_generic(state, data, nblocks); +} +EXPORT_SYMBOL_GPL(sha256_blocks_arch); + +bool sha256_is_arch_optimized(void) +{ + return static_key_enabled(&have_extensions); +} +EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); + +static int __init riscv64_sha256_mod_init(void) +{ + /* Both zvknha and zvknhb provide the SHA-256 instructions. */ + if ((riscv_isa_extension_available(NULL, ZVKNHA) || + riscv_isa_extension_available(NULL, ZVKNHB)) && + riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + static_branch_enable(&have_extensions); + return 0; +} +subsys_initcall(riscv64_sha256_mod_init); + +static void __exit riscv64_sha256_mod_exit(void) +{ +} +module_exit(riscv64_sha256_mod_exit); + +MODULE_DESCRIPTION("SHA-256 (RISC-V accelerated)"); +MODULE_AUTHOR("Heiko Stuebner "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From b8456f7aaf35bc5af247bc8f58412c2cffc331c9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 19 Jun 2025 12:19:05 -0700 Subject: lib/crypto: s390: Move arch/s390/lib/crypto/ into lib/crypto/ Move the contents of arch/s390/lib/crypto/ into lib/crypto/s390/. The new code organization makes a lot more sense for how this code actually works and is developed. In particular, it makes it possible to build each algorithm as a single module, with better inlining and dead code elimination. 
For a more detailed explanation, see the patchset which did this for the CRC library code: https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/. Also see the patchset which did this for SHA-512: https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/ This is just a preparatory commit, which does the move to get the files into their new location but keeps them building the same way as before. Later commits will make the actual improvements to the way the arch-optimized code is integrated for each algorithm. Acked-by: Ard Biesheuvel Reviewed-by: Martin K. Petersen Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/r/20250619191908.134235-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/s390/lib/Makefile | 1 - arch/s390/lib/crypto/Kconfig | 13 - arch/s390/lib/crypto/Makefile | 7 - arch/s390/lib/crypto/chacha-glue.c | 56 --- arch/s390/lib/crypto/chacha-s390.S | 908 ------------------------------------- arch/s390/lib/crypto/chacha-s390.h | 14 - arch/s390/lib/crypto/sha256.c | 47 -- lib/crypto/Kconfig | 2 +- lib/crypto/Makefile | 1 + lib/crypto/s390/Kconfig | 13 + lib/crypto/s390/Makefile | 7 + lib/crypto/s390/chacha-glue.c | 56 +++ lib/crypto/s390/chacha-s390.S | 908 +++++++++++++++++++++++++++++++++++++ lib/crypto/s390/chacha-s390.h | 14 + lib/crypto/s390/sha256.c | 47 ++ 15 files changed, 1047 insertions(+), 1047 deletions(-) delete mode 100644 arch/s390/lib/crypto/Kconfig delete mode 100644 arch/s390/lib/crypto/Makefile delete mode 100644 arch/s390/lib/crypto/chacha-glue.c delete mode 100644 arch/s390/lib/crypto/chacha-s390.S delete mode 100644 arch/s390/lib/crypto/chacha-s390.h delete mode 100644 arch/s390/lib/crypto/sha256.c create mode 100644 lib/crypto/s390/Kconfig create mode 100644 lib/crypto/s390/Makefile create mode 100644 lib/crypto/s390/chacha-glue.c create mode 100644 lib/crypto/s390/chacha-s390.S create mode 100644 lib/crypto/s390/chacha-s390.h create mode 100644 lib/crypto/s390/sha256.c diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index cd35cdbfa871..271a1c407121 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -3,7 +3,6 @@ # Makefile for s390-specific library files.. 
# -obj-y += crypto/ lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o lib-y += csum-partial.o obj-y += mem.o xor.o diff --git a/arch/s390/lib/crypto/Kconfig b/arch/s390/lib/crypto/Kconfig deleted file mode 100644 index e3f855ef4393..000000000000 --- a/arch/s390/lib/crypto/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_S390 - tristate - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_SHA256_S390 - tristate - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_LIB_SHA256_GENERIC diff --git a/arch/s390/lib/crypto/Makefile b/arch/s390/lib/crypto/Makefile deleted file mode 100644 index 5df30f1e7930..000000000000 --- a/arch/s390/lib/crypto/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o -chacha_s390-y := chacha-glue.o chacha-s390.o - -obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256-s390.o -sha256-s390-y := sha256.o diff --git a/arch/s390/lib/crypto/chacha-glue.c b/arch/s390/lib/crypto/chacha-glue.c deleted file mode 100644 index f95ba3483bbc..000000000000 --- a/arch/s390/lib/crypto/chacha-glue.c +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ChaCha stream cipher (s390 optimized) - * - * Copyright IBM Corp. 2021 - */ - -#define KMSG_COMPONENT "chacha_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include -#include -#include -#include -#include -#include -#include "chacha-s390.h" - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - /* TODO: implement hchacha_block_arch() in assembly */ - hchacha_block_generic(state, out, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - /* s390 chacha20 implementation has 20 rounds hard-coded, - * it cannot handle a block of data or less, but otherwise - * it can handle data of arbitrary size - */ - if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) { - chacha_crypt_generic(state, dst, src, bytes, nrounds); - } else { - DECLARE_KERNEL_FPU_ONSTACK32(vxstate); - - kernel_fpu_begin(&vxstate, KERNEL_VXR); - chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]); - kernel_fpu_end(&vxstate, KERNEL_VXR); - - state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) / - CHACHA_BLOCK_SIZE; - } -} -EXPORT_SYMBOL(chacha_crypt_arch); - -bool chacha_is_arch_optimized(void) -{ - return cpu_has_vx(); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/s390/lib/crypto/chacha-s390.S b/arch/s390/lib/crypto/chacha-s390.S deleted file mode 100644 index 63f3102678c0..000000000000 --- a/arch/s390/lib/crypto/chacha-s390.S +++ /dev/null @@ -1,908 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Original implementation written by Andy Polyakov, @dot-asm. - * This is an adaptation of the original code for kernel use. - * - * Copyright (C) 2006-2019 CRYPTOGAMS by . All Rights Reserved. 
- */ - -#include -#include -#include - -#define SP %r15 -#define FRAME (16 * 8 + 4 * 8) - - .data - .balign 32 - -SYM_DATA_START_LOCAL(sigma) - .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral - .long 1,0,0,0 - .long 2,0,0,0 - .long 3,0,0,0 - .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap - - .long 0,1,2,3 - .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma - .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e - .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 - .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 -SYM_DATA_END(sigma) - - .previous - - GEN_BR_THUNK %r14 - - .text - -############################################################################# -# void chacha20_vx_4x(u8 *out, counst u8 *inp, size_t len, -# counst u32 *key, const u32 *counter) - -#define OUT %r2 -#define INP %r3 -#define LEN %r4 -#define KEY %r5 -#define COUNTER %r6 - -#define BEPERM %v31 -#define CTR %v26 - -#define K0 %v16 -#define K1 %v17 -#define K2 %v18 -#define K3 %v19 - -#define XA0 %v0 -#define XA1 %v1 -#define XA2 %v2 -#define XA3 %v3 - -#define XB0 %v4 -#define XB1 %v5 -#define XB2 %v6 -#define XB3 %v7 - -#define XC0 %v8 -#define XC1 %v9 -#define XC2 %v10 -#define XC3 %v11 - -#define XD0 %v12 -#define XD1 %v13 -#define XD2 %v14 -#define XD3 %v15 - -#define XT0 %v27 -#define XT1 %v28 -#define XT2 %v29 -#define XT3 %v30 - -SYM_FUNC_START(chacha20_vx_4x) - stmg %r6,%r7,6*8(SP) - - larl %r7,sigma - lhi %r0,10 - lhi %r1,0 - - VL K0,0,,%r7 # load sigma - VL K1,0,,KEY # load key - VL K2,16,,KEY - VL K3,0,,COUNTER # load counter - - VL BEPERM,0x40,,%r7 - VL CTR,0x50,,%r7 - - VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma - - VREPF XB0,K1,0 # smash the key - VREPF XB1,K1,1 - VREPF XB2,K1,2 - VREPF XB3,K1,3 - - VREPF XD0,K3,0 - VREPF XD1,K3,1 - VREPF XD2,K3,2 - VREPF XD3,K3,3 - VAF XD0,XD0,CTR - - VREPF XC0,K2,0 - VREPF XC1,K2,1 - VREPF XC2,K2,2 - VREPF XC3,K2,3 - -.Loop_4x: - VAF XA0,XA0,XB0 - VX XD0,XD0,XA0 - VERLLF XD0,XD0,16 - - VAF XA1,XA1,XB1 - VX XD1,XD1,XA1 - VERLLF XD1,XD1,16 - - VAF XA2,XA2,XB2 - VX XD2,XD2,XA2 - VERLLF XD2,XD2,16 - - VAF XA3,XA3,XB3 - VX XD3,XD3,XA3 - VERLLF XD3,XD3,16 - - VAF XC0,XC0,XD0 - VX XB0,XB0,XC0 - VERLLF XB0,XB0,12 - - VAF XC1,XC1,XD1 - VX XB1,XB1,XC1 - VERLLF XB1,XB1,12 - - VAF XC2,XC2,XD2 - VX XB2,XB2,XC2 - VERLLF XB2,XB2,12 - - VAF XC3,XC3,XD3 - VX XB3,XB3,XC3 - VERLLF XB3,XB3,12 - - VAF XA0,XA0,XB0 - VX XD0,XD0,XA0 - VERLLF XD0,XD0,8 - - VAF XA1,XA1,XB1 - VX XD1,XD1,XA1 - VERLLF XD1,XD1,8 - - VAF XA2,XA2,XB2 - VX XD2,XD2,XA2 - VERLLF XD2,XD2,8 - - VAF XA3,XA3,XB3 - VX XD3,XD3,XA3 - VERLLF XD3,XD3,8 - - VAF XC0,XC0,XD0 - VX XB0,XB0,XC0 - VERLLF XB0,XB0,7 - - VAF XC1,XC1,XD1 - VX XB1,XB1,XC1 - VERLLF XB1,XB1,7 - - VAF XC2,XC2,XD2 - VX XB2,XB2,XC2 - VERLLF XB2,XB2,7 - - VAF XC3,XC3,XD3 - VX XB3,XB3,XC3 - VERLLF XB3,XB3,7 - - VAF XA0,XA0,XB1 - VX XD3,XD3,XA0 - VERLLF XD3,XD3,16 - - VAF XA1,XA1,XB2 - VX XD0,XD0,XA1 - VERLLF XD0,XD0,16 - - VAF XA2,XA2,XB3 - VX XD1,XD1,XA2 - VERLLF XD1,XD1,16 - - VAF XA3,XA3,XB0 - VX XD2,XD2,XA3 - VERLLF XD2,XD2,16 - - VAF XC2,XC2,XD3 - VX XB1,XB1,XC2 - VERLLF XB1,XB1,12 - - VAF XC3,XC3,XD0 - VX XB2,XB2,XC3 - VERLLF XB2,XB2,12 - - VAF XC0,XC0,XD1 - VX XB3,XB3,XC0 - VERLLF XB3,XB3,12 - - VAF XC1,XC1,XD2 - VX XB0,XB0,XC1 - VERLLF XB0,XB0,12 - - VAF XA0,XA0,XB1 - VX XD3,XD3,XA0 - VERLLF XD3,XD3,8 - - VAF XA1,XA1,XB2 - VX XD0,XD0,XA1 - VERLLF XD0,XD0,8 - - VAF XA2,XA2,XB3 - VX XD1,XD1,XA2 - VERLLF XD1,XD1,8 - - VAF XA3,XA3,XB0 - VX XD2,XD2,XA3 - VERLLF XD2,XD2,8 - - VAF XC2,XC2,XD3 - VX XB1,XB1,XC2 - 
VERLLF XB1,XB1,7 - - VAF XC3,XC3,XD0 - VX XB2,XB2,XC3 - VERLLF XB2,XB2,7 - - VAF XC0,XC0,XD1 - VX XB3,XB3,XC0 - VERLLF XB3,XB3,7 - - VAF XC1,XC1,XD2 - VX XB0,XB0,XC1 - VERLLF XB0,XB0,7 - brct %r0,.Loop_4x - - VAF XD0,XD0,CTR - - VMRHF XT0,XA0,XA1 # transpose data - VMRHF XT1,XA2,XA3 - VMRLF XT2,XA0,XA1 - VMRLF XT3,XA2,XA3 - VPDI XA0,XT0,XT1,0b0000 - VPDI XA1,XT0,XT1,0b0101 - VPDI XA2,XT2,XT3,0b0000 - VPDI XA3,XT2,XT3,0b0101 - - VMRHF XT0,XB0,XB1 - VMRHF XT1,XB2,XB3 - VMRLF XT2,XB0,XB1 - VMRLF XT3,XB2,XB3 - VPDI XB0,XT0,XT1,0b0000 - VPDI XB1,XT0,XT1,0b0101 - VPDI XB2,XT2,XT3,0b0000 - VPDI XB3,XT2,XT3,0b0101 - - VMRHF XT0,XC0,XC1 - VMRHF XT1,XC2,XC3 - VMRLF XT2,XC0,XC1 - VMRLF XT3,XC2,XC3 - VPDI XC0,XT0,XT1,0b0000 - VPDI XC1,XT0,XT1,0b0101 - VPDI XC2,XT2,XT3,0b0000 - VPDI XC3,XT2,XT3,0b0101 - - VMRHF XT0,XD0,XD1 - VMRHF XT1,XD2,XD3 - VMRLF XT2,XD0,XD1 - VMRLF XT3,XD2,XD3 - VPDI XD0,XT0,XT1,0b0000 - VPDI XD1,XT0,XT1,0b0101 - VPDI XD2,XT2,XT3,0b0000 - VPDI XD3,XT2,XT3,0b0101 - - VAF XA0,XA0,K0 - VAF XB0,XB0,K1 - VAF XC0,XC0,K2 - VAF XD0,XD0,K3 - - VPERM XA0,XA0,XA0,BEPERM - VPERM XB0,XB0,XB0,BEPERM - VPERM XC0,XC0,XC0,BEPERM - VPERM XD0,XD0,XD0,BEPERM - - VLM XT0,XT3,0,INP,0 - - VX XT0,XT0,XA0 - VX XT1,XT1,XB0 - VX XT2,XT2,XC0 - VX XT3,XT3,XD0 - - VSTM XT0,XT3,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - - VAF XA0,XA1,K0 - VAF XB0,XB1,K1 - VAF XC0,XC1,K2 - VAF XD0,XD1,K3 - - VPERM XA0,XA0,XA0,BEPERM - VPERM XB0,XB0,XB0,BEPERM - VPERM XC0,XC0,XC0,BEPERM - VPERM XD0,XD0,XD0,BEPERM - - clgfi LEN,0x40 - jl .Ltail_4x - - VLM XT0,XT3,0,INP,0 - - VX XT0,XT0,XA0 - VX XT1,XT1,XB0 - VX XT2,XT2,XC0 - VX XT3,XT3,XD0 - - VSTM XT0,XT3,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_4x - - VAF XA0,XA2,K0 - VAF XB0,XB2,K1 - VAF XC0,XC2,K2 - VAF XD0,XD2,K3 - - VPERM XA0,XA0,XA0,BEPERM - VPERM XB0,XB0,XB0,BEPERM - VPERM XC0,XC0,XC0,BEPERM - VPERM XD0,XD0,XD0,BEPERM - - clgfi LEN,0x40 - jl .Ltail_4x - - VLM XT0,XT3,0,INP,0 - - VX XT0,XT0,XA0 - VX XT1,XT1,XB0 - VX XT2,XT2,XC0 - VX XT3,XT3,XD0 - - VSTM XT0,XT3,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_4x - - VAF XA0,XA3,K0 - VAF XB0,XB3,K1 - VAF XC0,XC3,K2 - VAF XD0,XD3,K3 - - VPERM XA0,XA0,XA0,BEPERM - VPERM XB0,XB0,XB0,BEPERM - VPERM XC0,XC0,XC0,BEPERM - VPERM XD0,XD0,XD0,BEPERM - - clgfi LEN,0x40 - jl .Ltail_4x - - VLM XT0,XT3,0,INP,0 - - VX XT0,XT0,XA0 - VX XT1,XT1,XB0 - VX XT2,XT2,XC0 - VX XT3,XT3,XD0 - - VSTM XT0,XT3,0,OUT,0 - -.Ldone_4x: - lmg %r6,%r7,6*8(SP) - BR_EX %r14 - -.Ltail_4x: - VLR XT0,XC0 - VLR XT1,XD0 - - VST XA0,8*8+0x00,,SP - VST XB0,8*8+0x10,,SP - VST XT0,8*8+0x20,,SP - VST XT1,8*8+0x30,,SP - - lghi %r1,0 - -.Loop_tail_4x: - llgc %r5,0(%r1,INP) - llgc %r6,8*8(%r1,SP) - xr %r6,%r5 - stc %r6,0(%r1,OUT) - la %r1,1(%r1) - brct LEN,.Loop_tail_4x - - lmg %r6,%r7,6*8(SP) - BR_EX %r14 -SYM_FUNC_END(chacha20_vx_4x) - -#undef OUT -#undef INP -#undef LEN -#undef KEY -#undef COUNTER - -#undef BEPERM - -#undef K0 -#undef K1 -#undef K2 -#undef K3 - - -############################################################################# -# void chacha20_vx(u8 *out, counst u8 *inp, size_t len, -# counst u32 *key, const u32 *counter) - -#define OUT %r2 -#define INP %r3 -#define LEN %r4 -#define KEY %r5 -#define COUNTER %r6 - -#define BEPERM %v31 - -#define K0 %v27 -#define K1 %v24 -#define K2 %v25 -#define K3 %v26 - -#define A0 %v0 -#define B0 %v1 -#define C0 %v2 -#define D0 %v3 - -#define A1 %v4 -#define B1 %v5 -#define C1 %v6 -#define D1 %v7 - -#define A2 %v8 -#define B2 %v9 
-#define C2 %v10 -#define D2 %v11 - -#define A3 %v12 -#define B3 %v13 -#define C3 %v14 -#define D3 %v15 - -#define A4 %v16 -#define B4 %v17 -#define C4 %v18 -#define D4 %v19 - -#define A5 %v20 -#define B5 %v21 -#define C5 %v22 -#define D5 %v23 - -#define T0 %v27 -#define T1 %v28 -#define T2 %v29 -#define T3 %v30 - -SYM_FUNC_START(chacha20_vx) - clgfi LEN,256 - jle chacha20_vx_4x - stmg %r6,%r7,6*8(SP) - - lghi %r1,-FRAME - lgr %r0,SP - la SP,0(%r1,SP) - stg %r0,0(SP) # back-chain - - larl %r7,sigma - lhi %r0,10 - - VLM K1,K2,0,KEY,0 # load key - VL K3,0,,COUNTER # load counter - - VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ... - -.Loop_outer_vx: - VLR A0,K0 - VLR B0,K1 - VLR A1,K0 - VLR B1,K1 - VLR A2,K0 - VLR B2,K1 - VLR A3,K0 - VLR B3,K1 - VLR A4,K0 - VLR B4,K1 - VLR A5,K0 - VLR B5,K1 - - VLR D0,K3 - VAF D1,K3,T1 # K[3]+1 - VAF D2,K3,T2 # K[3]+2 - VAF D3,K3,T3 # K[3]+3 - VAF D4,D2,T2 # K[3]+4 - VAF D5,D2,T3 # K[3]+5 - - VLR C0,K2 - VLR C1,K2 - VLR C2,K2 - VLR C3,K2 - VLR C4,K2 - VLR C5,K2 - - VLR T1,D1 - VLR T2,D2 - VLR T3,D3 - -.Loop_vx: - VAF A0,A0,B0 - VAF A1,A1,B1 - VAF A2,A2,B2 - VAF A3,A3,B3 - VAF A4,A4,B4 - VAF A5,A5,B5 - VX D0,D0,A0 - VX D1,D1,A1 - VX D2,D2,A2 - VX D3,D3,A3 - VX D4,D4,A4 - VX D5,D5,A5 - VERLLF D0,D0,16 - VERLLF D1,D1,16 - VERLLF D2,D2,16 - VERLLF D3,D3,16 - VERLLF D4,D4,16 - VERLLF D5,D5,16 - - VAF C0,C0,D0 - VAF C1,C1,D1 - VAF C2,C2,D2 - VAF C3,C3,D3 - VAF C4,C4,D4 - VAF C5,C5,D5 - VX B0,B0,C0 - VX B1,B1,C1 - VX B2,B2,C2 - VX B3,B3,C3 - VX B4,B4,C4 - VX B5,B5,C5 - VERLLF B0,B0,12 - VERLLF B1,B1,12 - VERLLF B2,B2,12 - VERLLF B3,B3,12 - VERLLF B4,B4,12 - VERLLF B5,B5,12 - - VAF A0,A0,B0 - VAF A1,A1,B1 - VAF A2,A2,B2 - VAF A3,A3,B3 - VAF A4,A4,B4 - VAF A5,A5,B5 - VX D0,D0,A0 - VX D1,D1,A1 - VX D2,D2,A2 - VX D3,D3,A3 - VX D4,D4,A4 - VX D5,D5,A5 - VERLLF D0,D0,8 - VERLLF D1,D1,8 - VERLLF D2,D2,8 - VERLLF D3,D3,8 - VERLLF D4,D4,8 - VERLLF D5,D5,8 - - VAF C0,C0,D0 - VAF C1,C1,D1 - VAF C2,C2,D2 - VAF C3,C3,D3 - VAF C4,C4,D4 - VAF C5,C5,D5 - VX B0,B0,C0 - VX B1,B1,C1 - VX B2,B2,C2 - VX B3,B3,C3 - VX B4,B4,C4 - VX B5,B5,C5 - VERLLF B0,B0,7 - VERLLF B1,B1,7 - VERLLF B2,B2,7 - VERLLF B3,B3,7 - VERLLF B4,B4,7 - VERLLF B5,B5,7 - - VSLDB C0,C0,C0,8 - VSLDB C1,C1,C1,8 - VSLDB C2,C2,C2,8 - VSLDB C3,C3,C3,8 - VSLDB C4,C4,C4,8 - VSLDB C5,C5,C5,8 - VSLDB B0,B0,B0,4 - VSLDB B1,B1,B1,4 - VSLDB B2,B2,B2,4 - VSLDB B3,B3,B3,4 - VSLDB B4,B4,B4,4 - VSLDB B5,B5,B5,4 - VSLDB D0,D0,D0,12 - VSLDB D1,D1,D1,12 - VSLDB D2,D2,D2,12 - VSLDB D3,D3,D3,12 - VSLDB D4,D4,D4,12 - VSLDB D5,D5,D5,12 - - VAF A0,A0,B0 - VAF A1,A1,B1 - VAF A2,A2,B2 - VAF A3,A3,B3 - VAF A4,A4,B4 - VAF A5,A5,B5 - VX D0,D0,A0 - VX D1,D1,A1 - VX D2,D2,A2 - VX D3,D3,A3 - VX D4,D4,A4 - VX D5,D5,A5 - VERLLF D0,D0,16 - VERLLF D1,D1,16 - VERLLF D2,D2,16 - VERLLF D3,D3,16 - VERLLF D4,D4,16 - VERLLF D5,D5,16 - - VAF C0,C0,D0 - VAF C1,C1,D1 - VAF C2,C2,D2 - VAF C3,C3,D3 - VAF C4,C4,D4 - VAF C5,C5,D5 - VX B0,B0,C0 - VX B1,B1,C1 - VX B2,B2,C2 - VX B3,B3,C3 - VX B4,B4,C4 - VX B5,B5,C5 - VERLLF B0,B0,12 - VERLLF B1,B1,12 - VERLLF B2,B2,12 - VERLLF B3,B3,12 - VERLLF B4,B4,12 - VERLLF B5,B5,12 - - VAF A0,A0,B0 - VAF A1,A1,B1 - VAF A2,A2,B2 - VAF A3,A3,B3 - VAF A4,A4,B4 - VAF A5,A5,B5 - VX D0,D0,A0 - VX D1,D1,A1 - VX D2,D2,A2 - VX D3,D3,A3 - VX D4,D4,A4 - VX D5,D5,A5 - VERLLF D0,D0,8 - VERLLF D1,D1,8 - VERLLF D2,D2,8 - VERLLF D3,D3,8 - VERLLF D4,D4,8 - VERLLF D5,D5,8 - - VAF C0,C0,D0 - VAF C1,C1,D1 - VAF C2,C2,D2 - VAF C3,C3,D3 - VAF C4,C4,D4 - VAF C5,C5,D5 - VX B0,B0,C0 - VX B1,B1,C1 - VX B2,B2,C2 - VX B3,B3,C3 - VX B4,B4,C4 - VX 
B5,B5,C5 - VERLLF B0,B0,7 - VERLLF B1,B1,7 - VERLLF B2,B2,7 - VERLLF B3,B3,7 - VERLLF B4,B4,7 - VERLLF B5,B5,7 - - VSLDB C0,C0,C0,8 - VSLDB C1,C1,C1,8 - VSLDB C2,C2,C2,8 - VSLDB C3,C3,C3,8 - VSLDB C4,C4,C4,8 - VSLDB C5,C5,C5,8 - VSLDB B0,B0,B0,12 - VSLDB B1,B1,B1,12 - VSLDB B2,B2,B2,12 - VSLDB B3,B3,B3,12 - VSLDB B4,B4,B4,12 - VSLDB B5,B5,B5,12 - VSLDB D0,D0,D0,4 - VSLDB D1,D1,D1,4 - VSLDB D2,D2,D2,4 - VSLDB D3,D3,D3,4 - VSLDB D4,D4,D4,4 - VSLDB D5,D5,D5,4 - brct %r0,.Loop_vx - - VAF A0,A0,K0 - VAF B0,B0,K1 - VAF C0,C0,K2 - VAF D0,D0,K3 - VAF A1,A1,K0 - VAF D1,D1,T1 # +K[3]+1 - - VPERM A0,A0,A0,BEPERM - VPERM B0,B0,B0,BEPERM - VPERM C0,C0,C0,BEPERM - VPERM D0,D0,D0,BEPERM - - clgfi LEN,0x40 - jl .Ltail_vx - - VAF D2,D2,T2 # +K[3]+2 - VAF D3,D3,T3 # +K[3]+3 - VLM T0,T3,0,INP,0 - - VX A0,A0,T0 - VX B0,B0,T1 - VX C0,C0,T2 - VX D0,D0,T3 - - VLM K0,T3,0,%r7,4 # re-load sigma and increments - - VSTM A0,D0,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_vx - - VAF B1,B1,K1 - VAF C1,C1,K2 - - VPERM A0,A1,A1,BEPERM - VPERM B0,B1,B1,BEPERM - VPERM C0,C1,C1,BEPERM - VPERM D0,D1,D1,BEPERM - - clgfi LEN,0x40 - jl .Ltail_vx - - VLM A1,D1,0,INP,0 - - VX A0,A0,A1 - VX B0,B0,B1 - VX C0,C0,C1 - VX D0,D0,D1 - - VSTM A0,D0,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_vx - - VAF A2,A2,K0 - VAF B2,B2,K1 - VAF C2,C2,K2 - - VPERM A0,A2,A2,BEPERM - VPERM B0,B2,B2,BEPERM - VPERM C0,C2,C2,BEPERM - VPERM D0,D2,D2,BEPERM - - clgfi LEN,0x40 - jl .Ltail_vx - - VLM A1,D1,0,INP,0 - - VX A0,A0,A1 - VX B0,B0,B1 - VX C0,C0,C1 - VX D0,D0,D1 - - VSTM A0,D0,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_vx - - VAF A3,A3,K0 - VAF B3,B3,K1 - VAF C3,C3,K2 - VAF D2,K3,T3 # K[3]+3 - - VPERM A0,A3,A3,BEPERM - VPERM B0,B3,B3,BEPERM - VPERM C0,C3,C3,BEPERM - VPERM D0,D3,D3,BEPERM - - clgfi LEN,0x40 - jl .Ltail_vx - - VAF D3,D2,T1 # K[3]+4 - VLM A1,D1,0,INP,0 - - VX A0,A0,A1 - VX B0,B0,B1 - VX C0,C0,C1 - VX D0,D0,D1 - - VSTM A0,D0,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_vx - - VAF A4,A4,K0 - VAF B4,B4,K1 - VAF C4,C4,K2 - VAF D4,D4,D3 # +K[3]+4 - VAF D3,D3,T1 # K[3]+5 - VAF K3,D2,T3 # K[3]+=6 - - VPERM A0,A4,A4,BEPERM - VPERM B0,B4,B4,BEPERM - VPERM C0,C4,C4,BEPERM - VPERM D0,D4,D4,BEPERM - - clgfi LEN,0x40 - jl .Ltail_vx - - VLM A1,D1,0,INP,0 - - VX A0,A0,A1 - VX B0,B0,B1 - VX C0,C0,C1 - VX D0,D0,D1 - - VSTM A0,D0,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - aghi LEN,-0x40 - je .Ldone_vx - - VAF A5,A5,K0 - VAF B5,B5,K1 - VAF C5,C5,K2 - VAF D5,D5,D3 # +K[3]+5 - - VPERM A0,A5,A5,BEPERM - VPERM B0,B5,B5,BEPERM - VPERM C0,C5,C5,BEPERM - VPERM D0,D5,D5,BEPERM - - clgfi LEN,0x40 - jl .Ltail_vx - - VLM A1,D1,0,INP,0 - - VX A0,A0,A1 - VX B0,B0,B1 - VX C0,C0,C1 - VX D0,D0,D1 - - VSTM A0,D0,0,OUT,0 - - la INP,0x40(INP) - la OUT,0x40(OUT) - lhi %r0,10 - aghi LEN,-0x40 - jne .Loop_outer_vx - -.Ldone_vx: - lmg %r6,%r7,FRAME+6*8(SP) - la SP,FRAME(SP) - BR_EX %r14 - -.Ltail_vx: - VSTM A0,D0,8*8,SP,3 - lghi %r1,0 - -.Loop_tail_vx: - llgc %r5,0(%r1,INP) - llgc %r6,8*8(%r1,SP) - xr %r6,%r5 - stc %r6,0(%r1,OUT) - la %r1,1(%r1) - brct LEN,.Loop_tail_vx - - lmg %r6,%r7,FRAME+6*8(SP) - la SP,FRAME(SP) - BR_EX %r14 -SYM_FUNC_END(chacha20_vx) - -.previous diff --git a/arch/s390/lib/crypto/chacha-s390.h b/arch/s390/lib/crypto/chacha-s390.h deleted file mode 100644 index 733744ce30f5..000000000000 --- a/arch/s390/lib/crypto/chacha-s390.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * s390 ChaCha 
stream cipher. - * - * Copyright IBM Corp. 2021 - */ - -#ifndef _CHACHA_S390_H -#define _CHACHA_S390_H - -void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key, - const u32 *counter); - -#endif /* _CHACHA_S390_H */ diff --git a/arch/s390/lib/crypto/sha256.c b/arch/s390/lib/crypto/sha256.c deleted file mode 100644 index 7dfe120fafab..000000000000 --- a/arch/s390/lib/crypto/sha256.c +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF) - * - * Copyright 2025 Google LLC - */ -#include -#include -#include -#include -#include - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256); - -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - if (static_branch_likely(&have_cpacf_sha256)) - cpacf_kimd(CPACF_KIMD_SHA_256, state, data, - nblocks * SHA256_BLOCK_SIZE); - else - sha256_blocks_generic(state, data, nblocks); -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -bool sha256_is_arch_optimized(void) -{ - return static_key_enabled(&have_cpacf_sha256); -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - -static int __init sha256_s390_mod_init(void) -{ - if (cpu_have_feature(S390_CPU_FEATURE_MSA) && - cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) - static_branch_enable(&have_cpacf_sha256); - return 0; -} -subsys_initcall(sha256_s390_mod_init); - -static void __exit sha256_s390_mod_exit(void) -{ -} -module_exit(sha256_s390_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 using the CP Assist for Cryptographic Functions (CPACF)"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index a2b58ca2df0c..278b7ef5ec4f 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -205,7 +205,7 @@ if RISCV source "lib/crypto/riscv/Kconfig" endif if S390 -source "arch/s390/lib/crypto/Kconfig" +source "lib/crypto/s390/Kconfig" endif if SPARC source "arch/sparc/lib/crypto/Kconfig" diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 05b7e29ea0e8..26f65bb4c8d8 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -112,3 +112,4 @@ obj-$(CONFIG_ARM64) += arm64/ obj-$(CONFIG_MIPS) += mips/ obj-$(CONFIG_PPC) += powerpc/ obj-$(CONFIG_RISCV) += riscv/ +obj-$(CONFIG_S390) += s390/ diff --git a/lib/crypto/s390/Kconfig b/lib/crypto/s390/Kconfig new file mode 100644 index 000000000000..e3f855ef4393 --- /dev/null +++ b/lib/crypto/s390/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA_S390 + tristate + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_SHA256_S390 + tristate + default CRYPTO_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256 + select CRYPTO_LIB_SHA256_GENERIC diff --git a/lib/crypto/s390/Makefile b/lib/crypto/s390/Makefile new file mode 100644 index 000000000000..5df30f1e7930 --- /dev/null +++ b/lib/crypto/s390/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o +chacha_s390-y := chacha-glue.o chacha-s390.o + +obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256-s390.o +sha256-s390-y := sha256.o diff --git a/lib/crypto/s390/chacha-glue.c b/lib/crypto/s390/chacha-glue.c new file mode 100644 index 000000000000..f95ba3483bbc --- /dev/null +++ b/lib/crypto/s390/chacha-glue.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha stream cipher (s390 optimized) + * + * Copyright IBM Corp. 
2021 + */ + +#define KMSG_COMPONENT "chacha_s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include "chacha-s390.h" + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + /* TODO: implement hchacha_block_arch() in assembly */ + hchacha_block_generic(state, out, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + /* s390 chacha20 implementation has 20 rounds hard-coded, + * it cannot handle a block of data or less, but otherwise + * it can handle data of arbitrary size + */ + if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) { + chacha_crypt_generic(state, dst, src, bytes, nrounds); + } else { + DECLARE_KERNEL_FPU_ONSTACK32(vxstate); + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]); + kernel_fpu_end(&vxstate, KERNEL_VXR); + + state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) / + CHACHA_BLOCK_SIZE; + } +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return cpu_has_vx(); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/s390/chacha-s390.S b/lib/crypto/s390/chacha-s390.S new file mode 100644 index 000000000000..63f3102678c0 --- /dev/null +++ b/lib/crypto/s390/chacha-s390.S @@ -0,0 +1,908 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Original implementation written by Andy Polyakov, @dot-asm. + * This is an adaptation of the original code for kernel use. + * + * Copyright (C) 2006-2019 CRYPTOGAMS by . All Rights Reserved. 
+ */ + +#include +#include +#include + +#define SP %r15 +#define FRAME (16 * 8 + 4 * 8) + + .data + .balign 32 + +SYM_DATA_START_LOCAL(sigma) + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral + .long 1,0,0,0 + .long 2,0,0,0 + .long 3,0,0,0 + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap + + .long 0,1,2,3 + .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma + .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e + .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 + .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 +SYM_DATA_END(sigma) + + .previous + + GEN_BR_THUNK %r14 + + .text + +############################################################################# +# void chacha20_vx_4x(u8 *out, counst u8 *inp, size_t len, +# counst u32 *key, const u32 *counter) + +#define OUT %r2 +#define INP %r3 +#define LEN %r4 +#define KEY %r5 +#define COUNTER %r6 + +#define BEPERM %v31 +#define CTR %v26 + +#define K0 %v16 +#define K1 %v17 +#define K2 %v18 +#define K3 %v19 + +#define XA0 %v0 +#define XA1 %v1 +#define XA2 %v2 +#define XA3 %v3 + +#define XB0 %v4 +#define XB1 %v5 +#define XB2 %v6 +#define XB3 %v7 + +#define XC0 %v8 +#define XC1 %v9 +#define XC2 %v10 +#define XC3 %v11 + +#define XD0 %v12 +#define XD1 %v13 +#define XD2 %v14 +#define XD3 %v15 + +#define XT0 %v27 +#define XT1 %v28 +#define XT2 %v29 +#define XT3 %v30 + +SYM_FUNC_START(chacha20_vx_4x) + stmg %r6,%r7,6*8(SP) + + larl %r7,sigma + lhi %r0,10 + lhi %r1,0 + + VL K0,0,,%r7 # load sigma + VL K1,0,,KEY # load key + VL K2,16,,KEY + VL K3,0,,COUNTER # load counter + + VL BEPERM,0x40,,%r7 + VL CTR,0x50,,%r7 + + VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma + + VREPF XB0,K1,0 # smash the key + VREPF XB1,K1,1 + VREPF XB2,K1,2 + VREPF XB3,K1,3 + + VREPF XD0,K3,0 + VREPF XD1,K3,1 + VREPF XD2,K3,2 + VREPF XD3,K3,3 + VAF XD0,XD0,CTR + + VREPF XC0,K2,0 + VREPF XC1,K2,1 + VREPF XC2,K2,2 + VREPF XC3,K2,3 + +.Loop_4x: + VAF XA0,XA0,XB0 + VX XD0,XD0,XA0 + VERLLF XD0,XD0,16 + + VAF XA1,XA1,XB1 + VX XD1,XD1,XA1 + VERLLF XD1,XD1,16 + + VAF XA2,XA2,XB2 + VX XD2,XD2,XA2 + VERLLF XD2,XD2,16 + + VAF XA3,XA3,XB3 + VX XD3,XD3,XA3 + VERLLF XD3,XD3,16 + + VAF XC0,XC0,XD0 + VX XB0,XB0,XC0 + VERLLF XB0,XB0,12 + + VAF XC1,XC1,XD1 + VX XB1,XB1,XC1 + VERLLF XB1,XB1,12 + + VAF XC2,XC2,XD2 + VX XB2,XB2,XC2 + VERLLF XB2,XB2,12 + + VAF XC3,XC3,XD3 + VX XB3,XB3,XC3 + VERLLF XB3,XB3,12 + + VAF XA0,XA0,XB0 + VX XD0,XD0,XA0 + VERLLF XD0,XD0,8 + + VAF XA1,XA1,XB1 + VX XD1,XD1,XA1 + VERLLF XD1,XD1,8 + + VAF XA2,XA2,XB2 + VX XD2,XD2,XA2 + VERLLF XD2,XD2,8 + + VAF XA3,XA3,XB3 + VX XD3,XD3,XA3 + VERLLF XD3,XD3,8 + + VAF XC0,XC0,XD0 + VX XB0,XB0,XC0 + VERLLF XB0,XB0,7 + + VAF XC1,XC1,XD1 + VX XB1,XB1,XC1 + VERLLF XB1,XB1,7 + + VAF XC2,XC2,XD2 + VX XB2,XB2,XC2 + VERLLF XB2,XB2,7 + + VAF XC3,XC3,XD3 + VX XB3,XB3,XC3 + VERLLF XB3,XB3,7 + + VAF XA0,XA0,XB1 + VX XD3,XD3,XA0 + VERLLF XD3,XD3,16 + + VAF XA1,XA1,XB2 + VX XD0,XD0,XA1 + VERLLF XD0,XD0,16 + + VAF XA2,XA2,XB3 + VX XD1,XD1,XA2 + VERLLF XD1,XD1,16 + + VAF XA3,XA3,XB0 + VX XD2,XD2,XA3 + VERLLF XD2,XD2,16 + + VAF XC2,XC2,XD3 + VX XB1,XB1,XC2 + VERLLF XB1,XB1,12 + + VAF XC3,XC3,XD0 + VX XB2,XB2,XC3 + VERLLF XB2,XB2,12 + + VAF XC0,XC0,XD1 + VX XB3,XB3,XC0 + VERLLF XB3,XB3,12 + + VAF XC1,XC1,XD2 + VX XB0,XB0,XC1 + VERLLF XB0,XB0,12 + + VAF XA0,XA0,XB1 + VX XD3,XD3,XA0 + VERLLF XD3,XD3,8 + + VAF XA1,XA1,XB2 + VX XD0,XD0,XA1 + VERLLF XD0,XD0,8 + + VAF XA2,XA2,XB3 + VX XD1,XD1,XA2 + VERLLF XD1,XD1,8 + + VAF XA3,XA3,XB0 + VX XD2,XD2,XA3 + VERLLF XD2,XD2,8 + + VAF XC2,XC2,XD3 + VX XB1,XB1,XC2 + 
VERLLF XB1,XB1,7 + + VAF XC3,XC3,XD0 + VX XB2,XB2,XC3 + VERLLF XB2,XB2,7 + + VAF XC0,XC0,XD1 + VX XB3,XB3,XC0 + VERLLF XB3,XB3,7 + + VAF XC1,XC1,XD2 + VX XB0,XB0,XC1 + VERLLF XB0,XB0,7 + brct %r0,.Loop_4x + + VAF XD0,XD0,CTR + + VMRHF XT0,XA0,XA1 # transpose data + VMRHF XT1,XA2,XA3 + VMRLF XT2,XA0,XA1 + VMRLF XT3,XA2,XA3 + VPDI XA0,XT0,XT1,0b0000 + VPDI XA1,XT0,XT1,0b0101 + VPDI XA2,XT2,XT3,0b0000 + VPDI XA3,XT2,XT3,0b0101 + + VMRHF XT0,XB0,XB1 + VMRHF XT1,XB2,XB3 + VMRLF XT2,XB0,XB1 + VMRLF XT3,XB2,XB3 + VPDI XB0,XT0,XT1,0b0000 + VPDI XB1,XT0,XT1,0b0101 + VPDI XB2,XT2,XT3,0b0000 + VPDI XB3,XT2,XT3,0b0101 + + VMRHF XT0,XC0,XC1 + VMRHF XT1,XC2,XC3 + VMRLF XT2,XC0,XC1 + VMRLF XT3,XC2,XC3 + VPDI XC0,XT0,XT1,0b0000 + VPDI XC1,XT0,XT1,0b0101 + VPDI XC2,XT2,XT3,0b0000 + VPDI XC3,XT2,XT3,0b0101 + + VMRHF XT0,XD0,XD1 + VMRHF XT1,XD2,XD3 + VMRLF XT2,XD0,XD1 + VMRLF XT3,XD2,XD3 + VPDI XD0,XT0,XT1,0b0000 + VPDI XD1,XT0,XT1,0b0101 + VPDI XD2,XT2,XT3,0b0000 + VPDI XD3,XT2,XT3,0b0101 + + VAF XA0,XA0,K0 + VAF XB0,XB0,K1 + VAF XC0,XC0,K2 + VAF XD0,XD0,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + + VAF XA0,XA1,K0 + VAF XB0,XB1,K1 + VAF XC0,XC1,K2 + VAF XD0,XD1,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_4x + + VAF XA0,XA2,K0 + VAF XB0,XB2,K1 + VAF XC0,XC2,K2 + VAF XD0,XD2,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_4x + + VAF XA0,XA3,K0 + VAF XB0,XB3,K1 + VAF XC0,XC3,K2 + VAF XD0,XD3,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + +.Ldone_4x: + lmg %r6,%r7,6*8(SP) + BR_EX %r14 + +.Ltail_4x: + VLR XT0,XC0 + VLR XT1,XD0 + + VST XA0,8*8+0x00,,SP + VST XB0,8*8+0x10,,SP + VST XT0,8*8+0x20,,SP + VST XT1,8*8+0x30,,SP + + lghi %r1,0 + +.Loop_tail_4x: + llgc %r5,0(%r1,INP) + llgc %r6,8*8(%r1,SP) + xr %r6,%r5 + stc %r6,0(%r1,OUT) + la %r1,1(%r1) + brct LEN,.Loop_tail_4x + + lmg %r6,%r7,6*8(SP) + BR_EX %r14 +SYM_FUNC_END(chacha20_vx_4x) + +#undef OUT +#undef INP +#undef LEN +#undef KEY +#undef COUNTER + +#undef BEPERM + +#undef K0 +#undef K1 +#undef K2 +#undef K3 + + +############################################################################# +# void chacha20_vx(u8 *out, counst u8 *inp, size_t len, +# counst u32 *key, const u32 *counter) + +#define OUT %r2 +#define INP %r3 +#define LEN %r4 +#define KEY %r5 +#define COUNTER %r6 + +#define BEPERM %v31 + +#define K0 %v27 +#define K1 %v24 +#define K2 %v25 +#define K3 %v26 + +#define A0 %v0 +#define B0 %v1 +#define C0 %v2 +#define D0 %v3 + +#define A1 %v4 +#define B1 %v5 +#define C1 %v6 +#define D1 %v7 + +#define A2 %v8 +#define B2 %v9 
+#define C2 %v10 +#define D2 %v11 + +#define A3 %v12 +#define B3 %v13 +#define C3 %v14 +#define D3 %v15 + +#define A4 %v16 +#define B4 %v17 +#define C4 %v18 +#define D4 %v19 + +#define A5 %v20 +#define B5 %v21 +#define C5 %v22 +#define D5 %v23 + +#define T0 %v27 +#define T1 %v28 +#define T2 %v29 +#define T3 %v30 + +SYM_FUNC_START(chacha20_vx) + clgfi LEN,256 + jle chacha20_vx_4x + stmg %r6,%r7,6*8(SP) + + lghi %r1,-FRAME + lgr %r0,SP + la SP,0(%r1,SP) + stg %r0,0(SP) # back-chain + + larl %r7,sigma + lhi %r0,10 + + VLM K1,K2,0,KEY,0 # load key + VL K3,0,,COUNTER # load counter + + VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ... + +.Loop_outer_vx: + VLR A0,K0 + VLR B0,K1 + VLR A1,K0 + VLR B1,K1 + VLR A2,K0 + VLR B2,K1 + VLR A3,K0 + VLR B3,K1 + VLR A4,K0 + VLR B4,K1 + VLR A5,K0 + VLR B5,K1 + + VLR D0,K3 + VAF D1,K3,T1 # K[3]+1 + VAF D2,K3,T2 # K[3]+2 + VAF D3,K3,T3 # K[3]+3 + VAF D4,D2,T2 # K[3]+4 + VAF D5,D2,T3 # K[3]+5 + + VLR C0,K2 + VLR C1,K2 + VLR C2,K2 + VLR C3,K2 + VLR C4,K2 + VLR C5,K2 + + VLR T1,D1 + VLR T2,D2 + VLR T3,D3 + +.Loop_vx: + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,16 + VERLLF D1,D1,16 + VERLLF D2,D2,16 + VERLLF D3,D3,16 + VERLLF D4,D4,16 + VERLLF D5,D5,16 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,12 + VERLLF B1,B1,12 + VERLLF B2,B2,12 + VERLLF B3,B3,12 + VERLLF B4,B4,12 + VERLLF B5,B5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,8 + VERLLF D1,D1,8 + VERLLF D2,D2,8 + VERLLF D3,D3,8 + VERLLF D4,D4,8 + VERLLF D5,D5,8 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,7 + VERLLF B1,B1,7 + VERLLF B2,B2,7 + VERLLF B3,B3,7 + VERLLF B4,B4,7 + VERLLF B5,B5,7 + + VSLDB C0,C0,C0,8 + VSLDB C1,C1,C1,8 + VSLDB C2,C2,C2,8 + VSLDB C3,C3,C3,8 + VSLDB C4,C4,C4,8 + VSLDB C5,C5,C5,8 + VSLDB B0,B0,B0,4 + VSLDB B1,B1,B1,4 + VSLDB B2,B2,B2,4 + VSLDB B3,B3,B3,4 + VSLDB B4,B4,B4,4 + VSLDB B5,B5,B5,4 + VSLDB D0,D0,D0,12 + VSLDB D1,D1,D1,12 + VSLDB D2,D2,D2,12 + VSLDB D3,D3,D3,12 + VSLDB D4,D4,D4,12 + VSLDB D5,D5,D5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,16 + VERLLF D1,D1,16 + VERLLF D2,D2,16 + VERLLF D3,D3,16 + VERLLF D4,D4,16 + VERLLF D5,D5,16 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,12 + VERLLF B1,B1,12 + VERLLF B2,B2,12 + VERLLF B3,B3,12 + VERLLF B4,B4,12 + VERLLF B5,B5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,8 + VERLLF D1,D1,8 + VERLLF D2,D2,8 + VERLLF D3,D3,8 + VERLLF D4,D4,8 + VERLLF D5,D5,8 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX 
B5,B5,C5 + VERLLF B0,B0,7 + VERLLF B1,B1,7 + VERLLF B2,B2,7 + VERLLF B3,B3,7 + VERLLF B4,B4,7 + VERLLF B5,B5,7 + + VSLDB C0,C0,C0,8 + VSLDB C1,C1,C1,8 + VSLDB C2,C2,C2,8 + VSLDB C3,C3,C3,8 + VSLDB C4,C4,C4,8 + VSLDB C5,C5,C5,8 + VSLDB B0,B0,B0,12 + VSLDB B1,B1,B1,12 + VSLDB B2,B2,B2,12 + VSLDB B3,B3,B3,12 + VSLDB B4,B4,B4,12 + VSLDB B5,B5,B5,12 + VSLDB D0,D0,D0,4 + VSLDB D1,D1,D1,4 + VSLDB D2,D2,D2,4 + VSLDB D3,D3,D3,4 + VSLDB D4,D4,D4,4 + VSLDB D5,D5,D5,4 + brct %r0,.Loop_vx + + VAF A0,A0,K0 + VAF B0,B0,K1 + VAF C0,C0,K2 + VAF D0,D0,K3 + VAF A1,A1,K0 + VAF D1,D1,T1 # +K[3]+1 + + VPERM A0,A0,A0,BEPERM + VPERM B0,B0,B0,BEPERM + VPERM C0,C0,C0,BEPERM + VPERM D0,D0,D0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VAF D2,D2,T2 # +K[3]+2 + VAF D3,D3,T3 # +K[3]+3 + VLM T0,T3,0,INP,0 + + VX A0,A0,T0 + VX B0,B0,T1 + VX C0,C0,T2 + VX D0,D0,T3 + + VLM K0,T3,0,%r7,4 # re-load sigma and increments + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF B1,B1,K1 + VAF C1,C1,K2 + + VPERM A0,A1,A1,BEPERM + VPERM B0,B1,B1,BEPERM + VPERM C0,C1,C1,BEPERM + VPERM D0,D1,D1,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A2,A2,K0 + VAF B2,B2,K1 + VAF C2,C2,K2 + + VPERM A0,A2,A2,BEPERM + VPERM B0,B2,B2,BEPERM + VPERM C0,C2,C2,BEPERM + VPERM D0,D2,D2,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A3,A3,K0 + VAF B3,B3,K1 + VAF C3,C3,K2 + VAF D2,K3,T3 # K[3]+3 + + VPERM A0,A3,A3,BEPERM + VPERM B0,B3,B3,BEPERM + VPERM C0,C3,C3,BEPERM + VPERM D0,D3,D3,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VAF D3,D2,T1 # K[3]+4 + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A4,A4,K0 + VAF B4,B4,K1 + VAF C4,C4,K2 + VAF D4,D4,D3 # +K[3]+4 + VAF D3,D3,T1 # K[3]+5 + VAF K3,D2,T3 # K[3]+=6 + + VPERM A0,A4,A4,BEPERM + VPERM B0,B4,B4,BEPERM + VPERM C0,C4,C4,BEPERM + VPERM D0,D4,D4,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A5,A5,K0 + VAF B5,B5,K1 + VAF C5,C5,K2 + VAF D5,D5,D3 # +K[3]+5 + + VPERM A0,A5,A5,BEPERM + VPERM B0,B5,B5,BEPERM + VPERM C0,C5,C5,BEPERM + VPERM D0,D5,D5,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + lhi %r0,10 + aghi LEN,-0x40 + jne .Loop_outer_vx + +.Ldone_vx: + lmg %r6,%r7,FRAME+6*8(SP) + la SP,FRAME(SP) + BR_EX %r14 + +.Ltail_vx: + VSTM A0,D0,8*8,SP,3 + lghi %r1,0 + +.Loop_tail_vx: + llgc %r5,0(%r1,INP) + llgc %r6,8*8(%r1,SP) + xr %r6,%r5 + stc %r6,0(%r1,OUT) + la %r1,1(%r1) + brct LEN,.Loop_tail_vx + + lmg %r6,%r7,FRAME+6*8(SP) + la SP,FRAME(SP) + BR_EX %r14 +SYM_FUNC_END(chacha20_vx) + +.previous diff --git a/lib/crypto/s390/chacha-s390.h b/lib/crypto/s390/chacha-s390.h new file mode 100644 index 000000000000..733744ce30f5 --- /dev/null +++ b/lib/crypto/s390/chacha-s390.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 ChaCha stream cipher. 
+ * + * Copyright IBM Corp. 2021 + */ + +#ifndef _CHACHA_S390_H +#define _CHACHA_S390_H + +void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key, + const u32 *counter); + +#endif /* _CHACHA_S390_H */ diff --git a/lib/crypto/s390/sha256.c b/lib/crypto/s390/sha256.c new file mode 100644 index 000000000000..7dfe120fafab --- /dev/null +++ b/lib/crypto/s390/sha256.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include +#include +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256); + +void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_cpacf_sha256)) + cpacf_kimd(CPACF_KIMD_SHA_256, state, data, + nblocks * SHA256_BLOCK_SIZE); + else + sha256_blocks_generic(state, data, nblocks); +} +EXPORT_SYMBOL_GPL(sha256_blocks_arch); + +bool sha256_is_arch_optimized(void) +{ + return static_key_enabled(&have_cpacf_sha256); +} +EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); + +static int __init sha256_s390_mod_init(void) +{ + if (cpu_have_feature(S390_CPU_FEATURE_MSA) && + cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) + static_branch_enable(&have_cpacf_sha256); + return 0; +} +subsys_initcall(sha256_s390_mod_init); + +static void __exit sha256_s390_mod_exit(void) +{ +} +module_exit(sha256_s390_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-256 using the CP Assist for Cryptographic Functions (CPACF)"); -- cgit v1.2.3 From a32e93e10067d19dec302220a9124a21573e7e7e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 19 Jun 2025 12:19:06 -0700 Subject: lib/crypto: sparc: Move arch/sparc/lib/crypto/ into lib/crypto/ Move the contents of arch/sparc/lib/crypto/ into lib/crypto/sparc/. The new code organization makes a lot more sense for how this code actually works and is developed. In particular, it makes it possible to build each algorithm as a single module, with better inlining and dead code elimination. For a more detailed explanation, see the patchset which did this for the CRC library code: https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/. Also see the patchset which did this for SHA-512: https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/ This is just a preparatory commit, which does the move to get the files into their new location but keeps them building the same way as before. Later commits will make the actual improvements to the way the arch-optimized code is integrated for each algorithm. Acked-by: Ard Biesheuvel Reviewed-by: Martin K. 
Petersen Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/r/20250619191908.134235-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/sparc/lib/Makefile | 1 - arch/sparc/lib/crypto/Kconfig | 8 ---- arch/sparc/lib/crypto/Makefile | 4 -- arch/sparc/lib/crypto/sha256.c | 64 ------------------------------- arch/sparc/lib/crypto/sha256_asm.S | 78 -------------------------------------- lib/crypto/Kconfig | 2 +- lib/crypto/Makefile | 1 + lib/crypto/sparc/Kconfig | 8 ++++ lib/crypto/sparc/Makefile | 4 ++ lib/crypto/sparc/sha256.c | 64 +++++++++++++++++++++++++++++++ lib/crypto/sparc/sha256_asm.S | 78 ++++++++++++++++++++++++++++++++++++++ 11 files changed, 156 insertions(+), 156 deletions(-) delete mode 100644 arch/sparc/lib/crypto/Kconfig delete mode 100644 arch/sparc/lib/crypto/Makefile delete mode 100644 arch/sparc/lib/crypto/sha256.c delete mode 100644 arch/sparc/lib/crypto/sha256_asm.S create mode 100644 lib/crypto/sparc/Kconfig create mode 100644 lib/crypto/sparc/Makefile create mode 100644 lib/crypto/sparc/sha256.c create mode 100644 lib/crypto/sparc/sha256_asm.S diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 5cf9781d68b4..ef8860eb3f3d 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -4,7 +4,6 @@ asflags-y := -ansi -DST_DIV0=0x02 -obj-y += crypto/ lib-$(CONFIG_SPARC32) += ashrdi3.o lib-$(CONFIG_SPARC32) += memcpy.o memset.o lib-y += strlen.o diff --git a/arch/sparc/lib/crypto/Kconfig b/arch/sparc/lib/crypto/Kconfig deleted file mode 100644 index e5c3e4d3dba6..000000000000 --- a/arch/sparc/lib/crypto/Kconfig +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_SHA256_SPARC64 - tristate - depends on SPARC64 - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_LIB_SHA256_GENERIC diff --git a/arch/sparc/lib/crypto/Makefile b/arch/sparc/lib/crypto/Makefile deleted file mode 100644 index 75ee244ad6f7..000000000000 --- a/arch/sparc/lib/crypto/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_SHA256_SPARC64) += sha256-sparc64.o -sha256-sparc64-y := sha256.o sha256_asm.o diff --git a/arch/sparc/lib/crypto/sha256.c b/arch/sparc/lib/crypto/sha256.c deleted file mode 100644 index 8bdec2db08b3..000000000000 --- a/arch/sparc/lib/crypto/sha256.c +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * SHA-256 accelerated using the sparc64 sha256 opcodes - * - * Copyright (c) Jean-Luc Cooke - * Copyright (c) Andrew McDonald - * Copyright (c) 2002 James Morris - * SHA224 Support Copyright 2007 Intel Corporation - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes); - -asmlinkage void sha256_sparc64_transform(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); - -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - if (static_branch_likely(&have_sha256_opcodes)) - sha256_sparc64_transform(state, data, nblocks); - else - sha256_blocks_generic(state, data, nblocks); -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -bool sha256_is_arch_optimized(void) -{ - return static_key_enabled(&have_sha256_opcodes); -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - -static int __init sha256_sparc64_mod_init(void) -{ - unsigned long cfr; - - if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) - return 0; - - __asm__ __volatile__("rd %%asr26, %0" : "=r" 
(cfr)); - if (!(cfr & CFR_SHA256)) - return 0; - - static_branch_enable(&have_sha256_opcodes); - pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n"); - return 0; -} -subsys_initcall(sha256_sparc64_mod_init); - -static void __exit sha256_sparc64_mod_exit(void) -{ -} -module_exit(sha256_sparc64_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 accelerated using the sparc64 sha256 opcodes"); diff --git a/arch/sparc/lib/crypto/sha256_asm.S b/arch/sparc/lib/crypto/sha256_asm.S deleted file mode 100644 index ddcdd3daf31e..000000000000 --- a/arch/sparc/lib/crypto/sha256_asm.S +++ /dev/null @@ -1,78 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include - -ENTRY(sha256_sparc64_transform) - /* %o0 = state, %o1 = data, %o2 = nblocks */ - VISEntryHalf - ld [%o0 + 0x00], %f0 - ld [%o0 + 0x04], %f1 - ld [%o0 + 0x08], %f2 - ld [%o0 + 0x0c], %f3 - ld [%o0 + 0x10], %f4 - ld [%o0 + 0x14], %f5 - andcc %o1, 0x7, %g0 - ld [%o0 + 0x18], %f6 - bne,pn %xcc, 10f - ld [%o0 + 0x1c], %f7 - -1: - ldd [%o1 + 0x00], %f8 - ldd [%o1 + 0x08], %f10 - ldd [%o1 + 0x10], %f12 - ldd [%o1 + 0x18], %f14 - ldd [%o1 + 0x20], %f16 - ldd [%o1 + 0x28], %f18 - ldd [%o1 + 0x30], %f20 - ldd [%o1 + 0x38], %f22 - - SHA256 - - subcc %o2, 1, %o2 - bne,pt %xcc, 1b - add %o1, 0x40, %o1 - -5: - st %f0, [%o0 + 0x00] - st %f1, [%o0 + 0x04] - st %f2, [%o0 + 0x08] - st %f3, [%o0 + 0x0c] - st %f4, [%o0 + 0x10] - st %f5, [%o0 + 0x14] - st %f6, [%o0 + 0x18] - st %f7, [%o0 + 0x1c] - retl - VISExitHalf -10: - alignaddr %o1, %g0, %o1 - - ldd [%o1 + 0x00], %f10 -1: - ldd [%o1 + 0x08], %f12 - ldd [%o1 + 0x10], %f14 - ldd [%o1 + 0x18], %f16 - ldd [%o1 + 0x20], %f18 - ldd [%o1 + 0x28], %f20 - ldd [%o1 + 0x30], %f22 - ldd [%o1 + 0x38], %f24 - ldd [%o1 + 0x40], %f26 - - faligndata %f10, %f12, %f8 - faligndata %f12, %f14, %f10 - faligndata %f14, %f16, %f12 - faligndata %f16, %f18, %f14 - faligndata %f18, %f20, %f16 - faligndata %f20, %f22, %f18 - faligndata %f22, %f24, %f20 - faligndata %f24, %f26, %f22 - - SHA256 - - subcc %o2, 1, %o2 - fsrc2 %f26, %f10 - bne,pt %xcc, 1b - add %o1, 0x40, %o1 - - ba,a,pt %xcc, 5b -ENDPROC(sha256_sparc64_transform) diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 278b7ef5ec4f..b98543c7ef23 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -208,7 +208,7 @@ if S390 source "lib/crypto/s390/Kconfig" endif if SPARC -source "arch/sparc/lib/crypto/Kconfig" +source "lib/crypto/sparc/Kconfig" endif if X86 source "arch/x86/lib/crypto/Kconfig" diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 26f65bb4c8d8..7c1e7d06cac9 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -113,3 +113,4 @@ obj-$(CONFIG_MIPS) += mips/ obj-$(CONFIG_PPC) += powerpc/ obj-$(CONFIG_RISCV) += riscv/ obj-$(CONFIG_S390) += s390/ +obj-$(CONFIG_SPARC) += sparc/ diff --git a/lib/crypto/sparc/Kconfig b/lib/crypto/sparc/Kconfig new file mode 100644 index 000000000000..e5c3e4d3dba6 --- /dev/null +++ b/lib/crypto/sparc/Kconfig @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_SHA256_SPARC64 + tristate + depends on SPARC64 + default CRYPTO_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256 + select CRYPTO_LIB_SHA256_GENERIC diff --git a/lib/crypto/sparc/Makefile b/lib/crypto/sparc/Makefile new file mode 100644 index 000000000000..75ee244ad6f7 --- /dev/null +++ b/lib/crypto/sparc/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_SHA256_SPARC64) += sha256-sparc64.o +sha256-sparc64-y := sha256.o sha256_asm.o 
diff --git a/lib/crypto/sparc/sha256.c b/lib/crypto/sparc/sha256.c new file mode 100644 index 000000000000..8bdec2db08b3 --- /dev/null +++ b/lib/crypto/sparc/sha256.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SHA-256 accelerated using the sparc64 sha256 opcodes + * + * Copyright (c) Jean-Luc Cooke + * Copyright (c) Andrew McDonald + * Copyright (c) 2002 James Morris + * SHA224 Support Copyright 2007 Intel Corporation + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes); + +asmlinkage void sha256_sparc64_transform(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); + +void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha256_opcodes)) + sha256_sparc64_transform(state, data, nblocks); + else + sha256_blocks_generic(state, data, nblocks); +} +EXPORT_SYMBOL_GPL(sha256_blocks_arch); + +bool sha256_is_arch_optimized(void) +{ + return static_key_enabled(&have_sha256_opcodes); +} +EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); + +static int __init sha256_sparc64_mod_init(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return 0; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA256)) + return 0; + + static_branch_enable(&have_sha256_opcodes); + pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n"); + return 0; +} +subsys_initcall(sha256_sparc64_mod_init); + +static void __exit sha256_sparc64_mod_exit(void) +{ +} +module_exit(sha256_sparc64_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-256 accelerated using the sparc64 sha256 opcodes"); diff --git a/lib/crypto/sparc/sha256_asm.S b/lib/crypto/sparc/sha256_asm.S new file mode 100644 index 000000000000..ddcdd3daf31e --- /dev/null +++ b/lib/crypto/sparc/sha256_asm.S @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include + +ENTRY(sha256_sparc64_transform) + /* %o0 = state, %o1 = data, %o2 = nblocks */ + VISEntryHalf + ld [%o0 + 0x00], %f0 + ld [%o0 + 0x04], %f1 + ld [%o0 + 0x08], %f2 + ld [%o0 + 0x0c], %f3 + ld [%o0 + 0x10], %f4 + ld [%o0 + 0x14], %f5 + andcc %o1, 0x7, %g0 + ld [%o0 + 0x18], %f6 + bne,pn %xcc, 10f + ld [%o0 + 0x1c], %f7 + +1: + ldd [%o1 + 0x00], %f8 + ldd [%o1 + 0x08], %f10 + ldd [%o1 + 0x10], %f12 + ldd [%o1 + 0x18], %f14 + ldd [%o1 + 0x20], %f16 + ldd [%o1 + 0x28], %f18 + ldd [%o1 + 0x30], %f20 + ldd [%o1 + 0x38], %f22 + + SHA256 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + +5: + st %f0, [%o0 + 0x00] + st %f1, [%o0 + 0x04] + st %f2, [%o0 + 0x08] + st %f3, [%o0 + 0x0c] + st %f4, [%o0 + 0x10] + st %f5, [%o0 + 0x14] + st %f6, [%o0 + 0x18] + st %f7, [%o0 + 0x1c] + retl + VISExitHalf +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f10 +1: + ldd [%o1 + 0x08], %f12 + ldd [%o1 + 0x10], %f14 + ldd [%o1 + 0x18], %f16 + ldd [%o1 + 0x20], %f18 + ldd [%o1 + 0x28], %f20 + ldd [%o1 + 0x30], %f22 + ldd [%o1 + 0x38], %f24 + ldd [%o1 + 0x40], %f26 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + SHA256 + + subcc %o2, 1, %o2 + fsrc2 %f26, %f10 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(sha256_sparc64_transform) -- cgit v1.2.3 From 
74750aa78de33794aa9ab55de15da04bd41d1ac8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 19 Jun 2025 12:19:07 -0700 Subject: lib/crypto: x86: Move arch/x86/lib/crypto/ into lib/crypto/ Move the contents of arch/x86/lib/crypto/ into lib/crypto/x86/. The new code organization makes a lot more sense for how this code actually works and is developed. In particular, it makes it possible to build each algorithm as a single module, with better inlining and dead code elimination. For a more detailed explanation, see the patchset which did this for the CRC library code: https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/. Also see the patchset which did this for SHA-512: https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/ This is just a preparatory commit, which does the move to get the files into their new location but keeps them building the same way as before. Later commits will make the actual improvements to the way the arch-optimized code is integrated for each algorithm. Add a gitignore entry for the removed directory arch/x86/lib/crypto/ so that people don't accidentally commit leftover generated files. Acked-by: Ard Biesheuvel Reviewed-by: Martin K. Petersen Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/r/20250619191908.134235-9-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/x86/lib/.gitignore | 4 + arch/x86/lib/Makefile | 2 - arch/x86/lib/crypto/.gitignore | 2 - arch/x86/lib/crypto/Kconfig | 34 - arch/x86/lib/crypto/Makefile | 20 - arch/x86/lib/crypto/blake2s-core.S | 252 -- arch/x86/lib/crypto/blake2s-glue.c | 70 - arch/x86/lib/crypto/chacha-avx2-x86_64.S | 1021 ----- arch/x86/lib/crypto/chacha-avx512vl-x86_64.S | 836 ---- arch/x86/lib/crypto/chacha-ssse3-x86_64.S | 791 ---- arch/x86/lib/crypto/chacha_glue.c | 196 - arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl | 4253 --------------------- arch/x86/lib/crypto/poly1305_glue.c | 129 - arch/x86/lib/crypto/sha256-avx-asm.S | 499 --- arch/x86/lib/crypto/sha256-avx2-asm.S | 774 ---- arch/x86/lib/crypto/sha256-ni-asm.S | 196 - arch/x86/lib/crypto/sha256-ssse3-asm.S | 511 --- arch/x86/lib/crypto/sha256.c | 80 - lib/crypto/Kconfig | 2 +- lib/crypto/Makefile | 1 + lib/crypto/x86/.gitignore | 2 + lib/crypto/x86/Kconfig | 34 + lib/crypto/x86/Makefile | 20 + lib/crypto/x86/blake2s-core.S | 252 ++ lib/crypto/x86/blake2s-glue.c | 70 + lib/crypto/x86/chacha-avx2-x86_64.S | 1021 +++++ lib/crypto/x86/chacha-avx512vl-x86_64.S | 836 ++++ lib/crypto/x86/chacha-ssse3-x86_64.S | 791 ++++ lib/crypto/x86/chacha_glue.c | 196 + lib/crypto/x86/poly1305-x86_64-cryptogams.pl | 4253 +++++++++++++++++++++ lib/crypto/x86/poly1305_glue.c | 129 + lib/crypto/x86/sha256-avx-asm.S | 499 +++ lib/crypto/x86/sha256-avx2-asm.S | 774 ++++ lib/crypto/x86/sha256-ni-asm.S | 196 + lib/crypto/x86/sha256-ssse3-asm.S | 511 +++ lib/crypto/x86/sha256.c | 80 + 36 files changed, 9670 insertions(+), 9667 deletions(-) delete mode 100644 arch/x86/lib/crypto/.gitignore delete mode 100644 arch/x86/lib/crypto/Kconfig delete mode 100644 arch/x86/lib/crypto/Makefile delete mode 100644 arch/x86/lib/crypto/blake2s-core.S delete mode 100644 arch/x86/lib/crypto/blake2s-glue.c delete mode 100644 arch/x86/lib/crypto/chacha-avx2-x86_64.S delete mode 100644 arch/x86/lib/crypto/chacha-avx512vl-x86_64.S delete mode 100644 arch/x86/lib/crypto/chacha-ssse3-x86_64.S delete mode 100644 arch/x86/lib/crypto/chacha_glue.c delete mode 100644 arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl delete mode 100644 
arch/x86/lib/crypto/poly1305_glue.c delete mode 100644 arch/x86/lib/crypto/sha256-avx-asm.S delete mode 100644 arch/x86/lib/crypto/sha256-avx2-asm.S delete mode 100644 arch/x86/lib/crypto/sha256-ni-asm.S delete mode 100644 arch/x86/lib/crypto/sha256-ssse3-asm.S delete mode 100644 arch/x86/lib/crypto/sha256.c create mode 100644 lib/crypto/x86/.gitignore create mode 100644 lib/crypto/x86/Kconfig create mode 100644 lib/crypto/x86/Makefile create mode 100644 lib/crypto/x86/blake2s-core.S create mode 100644 lib/crypto/x86/blake2s-glue.c create mode 100644 lib/crypto/x86/chacha-avx2-x86_64.S create mode 100644 lib/crypto/x86/chacha-avx512vl-x86_64.S create mode 100644 lib/crypto/x86/chacha-ssse3-x86_64.S create mode 100644 lib/crypto/x86/chacha_glue.c create mode 100644 lib/crypto/x86/poly1305-x86_64-cryptogams.pl create mode 100644 lib/crypto/x86/poly1305_glue.c create mode 100644 lib/crypto/x86/sha256-avx-asm.S create mode 100644 lib/crypto/x86/sha256-avx2-asm.S create mode 100644 lib/crypto/x86/sha256-ni-asm.S create mode 100644 lib/crypto/x86/sha256-ssse3-asm.S create mode 100644 lib/crypto/x86/sha256.c diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore index 8ae0f93ecbfd..ec2131c9fd20 100644 --- a/arch/x86/lib/.gitignore +++ b/arch/x86/lib/.gitignore @@ -1,2 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only + +# This now-removed directory used to contain generated files. +/crypto/ + inat-tables.c diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 4fa5c4e1ba8a..7cf8681cba0f 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -3,8 +3,6 @@ # Makefile for x86 specific library files. # -obj-y += crypto/ - # Produces uninteresting flaky coverage. KCOV_INSTRUMENT_delay.o := n diff --git a/arch/x86/lib/crypto/.gitignore b/arch/x86/lib/crypto/.gitignore deleted file mode 100644 index 580c839bb177..000000000000 --- a/arch/x86/lib/crypto/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -poly1305-x86_64-cryptogams.S diff --git a/arch/x86/lib/crypto/Kconfig b/arch/x86/lib/crypto/Kconfig deleted file mode 100644 index 5e94cdee492c..000000000000 --- a/arch/x86/lib/crypto/Kconfig +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_BLAKE2S_X86 - bool "Hash functions: BLAKE2s (SSSE3/AVX-512)" - depends on 64BIT - select CRYPTO_LIB_BLAKE2S_GENERIC - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX-512 (Advanced Vector Extensions-512) - -config CRYPTO_CHACHA20_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - -config CRYPTO_SHA256_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD - select CRYPTO_LIB_SHA256_GENERIC diff --git a/arch/x86/lib/crypto/Makefile b/arch/x86/lib/crypto/Makefile deleted file mode 100644 index abceca3d31c0..000000000000 --- a/arch/x86/lib/crypto/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o -libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o - -obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o -chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o 
chacha-avx512vl-x86_64.o chacha_glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o -poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o -targets += poly1305-x86_64-cryptogams.S - -obj-$(CONFIG_CRYPTO_SHA256_X86_64) += sha256-x86_64.o -sha256-x86_64-y := sha256.o sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256-ni-asm.o - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $< > $@ - -$(obj)/%.S: $(src)/%.pl FORCE - $(call if_changed,perlasm) diff --git a/arch/x86/lib/crypto/blake2s-core.S b/arch/x86/lib/crypto/blake2s-core.S deleted file mode 100644 index ac1c845445a4..000000000000 --- a/arch/x86/lib/crypto/blake2s-core.S +++ /dev/null @@ -1,252 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - * Copyright (C) 2017-2019 Samuel Neves . All Rights Reserved. - */ - -#include - -.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 -.align 32 -IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 - .octa 0x5BE0CD191F83D9AB9B05688C510E527F -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 -.section .rodata.cst16.ROR328, "aM", @progbits, 16 -.align 16 -ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 -.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 -.align 64 -SIGMA: -.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 -.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 -.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 -.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 -.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 -.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 -.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 -.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 -.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 -.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 -.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 -.align 64 -SIGMA2: -.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 -.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 -.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 -.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 -.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 -.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 -.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 -.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 -.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 -.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 - -.text -SYM_FUNC_START(blake2s_compress_ssse3) - testq %rdx,%rdx - je .Lendofloop - movdqu (%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqa ROT16(%rip),%xmm12 - movdqa ROR328(%rip),%xmm13 - movdqu 0x20(%rdi),%xmm14 - movq %rcx,%xmm15 - leaq SIGMA+0xa0(%rip),%r8 - jmp .Lbeginofloop - .align 32 -.Lbeginofloop: - movdqa %xmm0,%xmm10 - movdqa %xmm1,%xmm11 - paddq %xmm15,%xmm14 - movdqa IV(%rip),%xmm2 - movdqa %xmm14,%xmm3 - pxor IV+0x10(%rip),%xmm3 - leaq SIGMA(%rip),%rcx -.Lroundloop: - movzbl (%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0x1(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x2(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x3(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - punpckldq %xmm5,%xmm4 - punpckldq %xmm7,%xmm6 - punpcklqdq %xmm6,%xmm4 - paddd %xmm4,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm12,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 
- psrld $0xc,%xmm1 - pslld $0x14,%xmm8 - por %xmm8,%xmm1 - movzbl 0x4(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x5(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x6(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0x7(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - punpckldq %xmm6,%xmm5 - punpckldq %xmm4,%xmm7 - punpcklqdq %xmm7,%xmm5 - paddd %xmm5,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm13,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 - por %xmm8,%xmm1 - pshufd $0x93,%xmm0,%xmm0 - pshufd $0x4e,%xmm3,%xmm3 - pshufd $0x39,%xmm2,%xmm2 - movzbl 0x8(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x9(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xa(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xb(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - punpckldq %xmm7,%xmm6 - punpckldq %xmm5,%xmm4 - punpcklqdq %xmm4,%xmm6 - paddd %xmm6,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm12,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 - por %xmm8,%xmm1 - movzbl 0xc(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xd(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xe(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0xf(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - punpckldq %xmm4,%xmm7 - punpckldq %xmm6,%xmm5 - punpcklqdq %xmm5,%xmm7 - paddd %xmm7,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm13,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 - por %xmm8,%xmm1 - pshufd $0x39,%xmm0,%xmm0 - pshufd $0x4e,%xmm3,%xmm3 - pshufd $0x93,%xmm2,%xmm2 - addq $0x10,%rcx - cmpq %r8,%rcx - jnz .Lroundloop - pxor %xmm2,%xmm0 - pxor %xmm3,%xmm1 - pxor %xmm10,%xmm0 - pxor %xmm11,%xmm1 - addq $0x40,%rsi - decq %rdx - jnz .Lbeginofloop - movdqu %xmm0,(%rdi) - movdqu %xmm1,0x10(%rdi) - movdqu %xmm14,0x20(%rdi) -.Lendofloop: - RET -SYM_FUNC_END(blake2s_compress_ssse3) - -SYM_FUNC_START(blake2s_compress_avx512) - vmovdqu (%rdi),%xmm0 - vmovdqu 0x10(%rdi),%xmm1 - vmovdqu 0x20(%rdi),%xmm4 - vmovq %rcx,%xmm5 - vmovdqa IV(%rip),%xmm14 - vmovdqa IV+16(%rip),%xmm15 - jmp .Lblake2s_compress_avx512_mainloop -.align 32 -.Lblake2s_compress_avx512_mainloop: - vmovdqa %xmm0,%xmm10 - vmovdqa %xmm1,%xmm11 - vpaddq %xmm5,%xmm4,%xmm4 - vmovdqa %xmm14,%xmm2 - vpxor %xmm15,%xmm4,%xmm3 - vmovdqu (%rsi),%ymm6 - vmovdqu 0x20(%rsi),%ymm7 - addq $0x40,%rsi - leaq SIGMA2(%rip),%rax - movb $0xa,%cl -.Lblake2s_compress_avx512_roundloop: - addq $0x40,%rax - vmovdqa -0x40(%rax),%ymm8 - vmovdqa -0x20(%rax),%ymm9 - vpermi2d %ymm7,%ymm6,%ymm8 - vpermi2d %ymm7,%ymm6,%ymm9 - vmovdqa %ymm8,%ymm6 - vmovdqa %ymm9,%ymm7 - vpaddd %xmm8,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm8,%xmm8 - vpaddd %xmm8,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 - vpshufd $0x93,%xmm0,%xmm0 - vpshufd $0x4e,%xmm3,%xmm3 - vpshufd $0x39,%xmm2,%xmm2 - vpaddd %xmm9,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm9,%xmm9 - vpaddd %xmm9,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - 
vprord $0x7,%xmm1,%xmm1 - vpshufd $0x39,%xmm0,%xmm0 - vpshufd $0x4e,%xmm3,%xmm3 - vpshufd $0x93,%xmm2,%xmm2 - decb %cl - jne .Lblake2s_compress_avx512_roundloop - vpxor %xmm10,%xmm0,%xmm0 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm2,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - decq %rdx - jne .Lblake2s_compress_avx512_mainloop - vmovdqu %xmm0,(%rdi) - vmovdqu %xmm1,0x10(%rdi) - vmovdqu %xmm4,0x20(%rdi) - vzeroupper - RET -SYM_FUNC_END(blake2s_compress_avx512) diff --git a/arch/x86/lib/crypto/blake2s-glue.c b/arch/x86/lib/crypto/blake2s-glue.c deleted file mode 100644 index adc296cd17c9..000000000000 --- a/arch/x86/lib/crypto/blake2s-glue.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); -asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); - -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) -{ - /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); - - if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { - blake2s_compress_generic(state, block, nblocks, inc); - return; - } - - do { - const size_t blocks = min_t(size_t, nblocks, - SZ_4K / BLAKE2S_BLOCK_SIZE); - - kernel_fpu_begin(); - if (static_branch_likely(&blake2s_use_avx512)) - blake2s_compress_avx512(state, block, blocks, inc); - else - blake2s_compress_ssse3(state, block, blocks, inc); - kernel_fpu_end(); - - nblocks -= blocks; - block += blocks * BLAKE2S_BLOCK_SIZE; - } while (nblocks); -} -EXPORT_SYMBOL(blake2s_compress); - -static int __init blake2s_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - static_branch_enable(&blake2s_use_ssse3); - - if (boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512VL) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | - XFEATURE_MASK_AVX512, NULL)) - static_branch_enable(&blake2s_use_avx512); - - return 0; -} - -subsys_initcall(blake2s_mod_init); diff --git a/arch/x86/lib/crypto/chacha-avx2-x86_64.S b/arch/x86/lib/crypto/chacha-avx2-x86_64.S deleted file mode 100644 index f3d8fc018249..000000000000 --- a/arch/x86/lib/crypto/chacha-avx2-x86_64.S +++ /dev/null @@ -1,1021 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ChaCha 256-bit cipher algorithm, x64 AVX2 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include - -.section .rodata.cst32.ROT8, "aM", @progbits, 32 -.align 32 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 - .octa 0x0e0d0c0f0a09080b0605040702010003 - -.section .rodata.cst32.ROT16, "aM", @progbits, 32 -.align 32 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 - .octa 0x0d0c0f0e09080b0a0504070601000302 - -.section .rodata.cst32.CTRINC, "aM", @progbits, 32 -.align 32 -CTRINC: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 
0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - -.text - -SYM_FUNC_START(chacha_2block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts two ChaCha blocks by loading the state - # matrix twice across four AVX registers. It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. - - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - - vmovdqa ROT8(%rip),%ymm4 - vmovdqa ROT16(%rip),%ymm5 - - mov %rcx,%rax - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - sub $2,%r8d - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rax - jl .Lxorpart2 - vpxor 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rax - jl .Lxorpart2 - vpxor 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rax - jl .Lxorpart2 - vpxor 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rax - jl .Lxorpart2 - vpxor 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rax - jl .Lxorpart2 - vpxor 
0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rax - jl .Lxorpart2 - vpxor 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rax - jl .Lxorpart2 - vpxor 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rax - jl .Lxorpart2 - vpxor 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - RET - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone2 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm7,%xmm7 - vmovdqa %xmm7,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone2 - -SYM_FUNC_END(chacha_2block_xor_avx2) - -SYM_FUNC_START(chacha_4block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four ChaCha blocks by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. - - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - - vmovdqa ROT8(%rip),%ymm8 - vmovdqa ROT16(%rip),%ymm9 - - mov %rcx,%rax - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor 
%ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - sub $2,%r8d - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rax - jl .Lxorpart4 - vpxor 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rax - jl .Lxorpart4 - vpxor 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rax - jl .Lxorpart4 - vpxor 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rax - jl .Lxorpart4 - vpxor 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rax - jl .Lxorpart4 - vpxor 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rax - jl .Lxorpart4 - vpxor 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rax - jl .Lxorpart4 - vpxor 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rax - jl .Lxorpart4 - vpxor 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rax - jl .Lxorpart4 - vpxor 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rax - jl .Lxorpart4 - vpxor 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rax - jl .Lxorpart4 - vpxor 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rax - jl .Lxorpart4 - vpxor 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rax - jl .Lxorpart4 - vpxor 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - 
cmp $0xe0,%rax - jl .Lxorpart4 - vpxor 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rax - jl .Lxorpart4 - vpxor 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rax - jl .Lxorpart4 - vpxor 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm10,%xmm10 - vmovdqa %xmm10,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_avx2) - -SYM_FUNC_START(chacha_8block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts eight consecutive ChaCha blocks by loading - # the state matrix in AVX registers eight times. As we need some - # scratch registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32-, 64- and then 128-bit - # words, which allows us to do XOR in AVX registers. 8/16-bit word - # rotation is done with the slightly better performing byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - vzeroupper - # 4 * 32 byte stack, 32-byte aligned - lea 8(%rsp),%r10 - and $~31, %rsp - sub $0x80, %rsp - mov %rcx,%rax - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - # x0..3 on stack - vmovdqa %ymm0,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm3,0x60(%rsp) - - vmovdqa CTRINC(%rip),%ymm1 - vmovdqa ROT8(%rip),%ymm2 - vmovdqa ROT16(%rip),%ymm3 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld 
$12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 
^ x3, 8) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - sub $2,%r8d - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpaddd 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpbroadcastd 0x04(%rdi),%ymm0 - vpaddd 0x20(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpbroadcastd 0x08(%rdi),%ymm0 - vpaddd 0x40(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpbroadcastd 0x0c(%rdi),%ymm0 - vpaddd 0x60(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpbroadcastd 0x10(%rdi),%ymm0 - vpaddd %ymm0,%ymm4,%ymm4 - vpbroadcastd 0x14(%rdi),%ymm0 - vpaddd %ymm0,%ymm5,%ymm5 - vpbroadcastd 0x18(%rdi),%ymm0 - vpaddd %ymm0,%ymm6,%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm0 - vpaddd %ymm0,%ymm7,%ymm7 - vpbroadcastd 0x20(%rdi),%ymm0 - vpaddd %ymm0,%ymm8,%ymm8 - vpbroadcastd 0x24(%rdi),%ymm0 - vpaddd %ymm0,%ymm9,%ymm9 - vpbroadcastd 0x28(%rdi),%ymm0 - vpaddd %ymm0,%ymm10,%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm0 - vpaddd %ymm0,%ymm11,%ymm11 - vpbroadcastd 0x30(%rdi),%ymm0 - vpaddd %ymm0,%ymm12,%ymm12 - vpbroadcastd 0x34(%rdi),%ymm0 - vpaddd %ymm0,%ymm13,%ymm13 - vpbroadcastd 0x38(%rdi),%ymm0 - vpaddd %ymm0,%ymm14,%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm0 - vpaddd %ymm0,%ymm15,%ymm15 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - - # interleave 32-bit words in state n, n+1 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x20(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa 0x40(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm1,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpckldq %ymm5,%ymm0,%ymm4 - vpunpckhdq %ymm5,%ymm0,%ymm5 - vmovdqa %ymm6,%ymm0 - vpunpckldq %ymm7,%ymm0,%ymm6 - vpunpckhdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpckldq %ymm9,%ymm0,%ymm8 - vpunpckhdq %ymm9,%ymm0,%ymm9 - vmovdqa %ymm10,%ymm0 - vpunpckldq %ymm11,%ymm0,%ymm10 - vpunpckhdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpckldq %ymm13,%ymm0,%ymm12 - vpunpckhdq %ymm13,%ymm0,%ymm13 - vmovdqa %ymm14,%ymm0 - vpunpckldq %ymm15,%ymm0,%ymm14 - vpunpckhdq %ymm15,%ymm0,%ymm15 - - # interleave 64-bit words in state n, n+2 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x40(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x00(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa 0x20(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpcklqdq %ymm6,%ymm0,%ymm4 - vpunpckhqdq %ymm6,%ymm0,%ymm6 - vmovdqa %ymm5,%ymm0 - vpunpcklqdq %ymm7,%ymm0,%ymm5 - vpunpckhqdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpcklqdq %ymm10,%ymm0,%ymm8 - vpunpckhqdq 
%ymm10,%ymm0,%ymm10 - vmovdqa %ymm9,%ymm0 - vpunpcklqdq %ymm11,%ymm0,%ymm9 - vpunpckhqdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpcklqdq %ymm14,%ymm0,%ymm12 - vpunpckhqdq %ymm14,%ymm0,%ymm14 - vmovdqa %ymm13,%ymm0 - vpunpcklqdq %ymm15,%ymm0,%ymm13 - vpunpckhqdq %ymm15,%ymm0,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa 0x00(%rsp),%ymm1 - vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 - cmp $0x0020,%rax - jl .Lxorpart8 - vpxor 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0000(%rsi) - vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 - - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rax - jl .Lxorpart8 - vpxor 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vmovdqa 0x40(%rsp),%ymm1 - vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 - cmp $0x0060,%rax - jl .Lxorpart8 - vpxor 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rax - jl .Lxorpart8 - vpxor 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vmovdqa 0x20(%rsp),%ymm1 - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rax - jl .Lxorpart8 - vpxor 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rax - jl .Lxorpart8 - vpxor 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vmovdqa 0x60(%rsp),%ymm1 - vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 - cmp $0x00e0,%rax - jl .Lxorpart8 - vpxor 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rax - jl .Lxorpart8 - vpxor 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa %ymm4,%ymm0 - cmp $0x0120,%rax - jl .Lxorpart8 - vpxor 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0100(%rsi) - - vmovdqa %ymm12,%ymm0 - cmp $0x0140,%rax - jl .Lxorpart8 - vpxor 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0120(%rsi) - - vmovdqa %ymm6,%ymm0 - cmp $0x0160,%rax - jl .Lxorpart8 - vpxor 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0140(%rsi) - - vmovdqa %ymm14,%ymm0 - cmp $0x0180,%rax - jl .Lxorpart8 - vpxor 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0160(%rsi) - - vmovdqa %ymm5,%ymm0 - cmp $0x01a0,%rax - jl .Lxorpart8 - vpxor 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0180(%rsi) - - vmovdqa %ymm13,%ymm0 - cmp $0x01c0,%rax - jl .Lxorpart8 - vpxor 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01a0(%rsi) - - vmovdqa %ymm7,%ymm0 - cmp $0x01e0,%rax - jl .Lxorpart8 - vpxor 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01c0(%rsi) - - vmovdqa %ymm15,%ymm0 - cmp $0x0200,%rax - jl .Lxorpart8 - vpxor 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - lea -8(%r10),%rsp - RET - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x1f,%r9 - jz .Ldone8 - and $~0x1f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone8 - -SYM_FUNC_END(chacha_8block_xor_avx2) diff --git a/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S b/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S deleted file mode 100644 index 259383e1ad44..000000000000 --- 
a/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S +++ /dev/null @@ -1,836 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions - * - * Copyright (C) 2018 Martin Willi - */ - -#include - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - -.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 -.align 32 -CTR8BL: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.text - -SYM_FUNC_START(chacha_2block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts two ChaCha blocks by loading the state - # matrix twice across four AVX registers. It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. - - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - sub $2,%r8d - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rcx - jl .Lxorpart2 - vpxord 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rcx - jl .Lxorpart2 - vpxord 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rcx - jl .Lxorpart2 - vpxord 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rcx - jl .Lxorpart2 - vpxord 
0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rcx - jl .Lxorpart2 - vpxord 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rcx - jl .Lxorpart2 - vpxord 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rcx - jl .Lxorpart2 - vpxord 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rcx - jl .Lxorpart2 - vpxord 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - RET - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone2 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm7,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone2 - -SYM_FUNC_END(chacha_2block_xor_avx512vl) - -SYM_FUNC_START(chacha_4block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four ChaCha blocks by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. - - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold 
$12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - sub $2,%r8d - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rcx - jl .Lxorpart4 - vpxord 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rcx - jl .Lxorpart4 - vpxord 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rcx - jl .Lxorpart4 - vpxord 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rcx - jl .Lxorpart4 - vpxord 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rcx - jl .Lxorpart4 - vpxord 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rcx - jl .Lxorpart4 - vpxord 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rcx - jl .Lxorpart4 - vpxord 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rcx - jl .Lxorpart4 - vpxord 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rcx - jl .Lxorpart4 - vpxord 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rcx - jl .Lxorpart4 - vpxord 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rcx - jl .Lxorpart4 - vpxord 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rcx - jl .Lxorpart4 - vpxord 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rcx - jl .Lxorpart4 - vpxord 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - cmp $0xe0,%rcx - jl .Lxorpart4 - vpxord 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rcx - jl .Lxorpart4 - vpxord 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rcx - jl .Lxorpart4 - vpxord 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone4 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 
(%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm10,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_avx512vl) - -SYM_FUNC_START(chacha_8block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts eight consecutive ChaCha blocks by loading - # the state matrix in AVX registers eight times. Compared to AVX2, this - # mostly benefits from the new rotate instructions in VL and the - # additional registers. - - vzeroupper - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - - # x12 += counter values 0-3 - vpaddd CTR8BL(%rip),%ymm12,%ymm12 - - vmovdqa64 %ymm0,%ymm16 - vmovdqa64 %ymm1,%ymm17 - vmovdqa64 %ymm2,%ymm18 - vmovdqa64 %ymm3,%ymm19 - vmovdqa64 %ymm4,%ymm20 - vmovdqa64 %ymm5,%ymm21 - vmovdqa64 %ymm6,%ymm22 - vmovdqa64 %ymm7,%ymm23 - vmovdqa64 %ymm8,%ymm24 - vmovdqa64 %ymm9,%ymm25 - vmovdqa64 %ymm10,%ymm26 - vmovdqa64 %ymm11,%ymm27 - vmovdqa64 %ymm12,%ymm28 - vmovdqa64 %ymm13,%ymm29 - vmovdqa64 %ymm14,%ymm30 - vmovdqa64 %ymm15,%ymm31 - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd 
%ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - - sub $2,%r8d - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpaddd %ymm16,%ymm0,%ymm0 - vpaddd %ymm17,%ymm1,%ymm1 - vpaddd %ymm18,%ymm2,%ymm2 - vpaddd %ymm19,%ymm3,%ymm3 - vpaddd %ymm20,%ymm4,%ymm4 - vpaddd %ymm21,%ymm5,%ymm5 - vpaddd %ymm22,%ymm6,%ymm6 - vpaddd %ymm23,%ymm7,%ymm7 - vpaddd %ymm24,%ymm8,%ymm8 - vpaddd %ymm25,%ymm9,%ymm9 - vpaddd %ymm26,%ymm10,%ymm10 - vpaddd %ymm27,%ymm11,%ymm11 - vpaddd %ymm28,%ymm12,%ymm12 - vpaddd %ymm29,%ymm13,%ymm13 - vpaddd %ymm30,%ymm14,%ymm14 - vpaddd %ymm31,%ymm15,%ymm15 - - # interleave 32-bit words in state n, n+1 - vpunpckldq %ymm1,%ymm0,%ymm16 - vpunpckhdq %ymm1,%ymm0,%ymm17 - vpunpckldq %ymm3,%ymm2,%ymm18 - vpunpckhdq %ymm3,%ymm2,%ymm19 - vpunpckldq %ymm5,%ymm4,%ymm20 - vpunpckhdq %ymm5,%ymm4,%ymm21 - vpunpckldq %ymm7,%ymm6,%ymm22 - vpunpckhdq %ymm7,%ymm6,%ymm23 - vpunpckldq %ymm9,%ymm8,%ymm24 - vpunpckhdq %ymm9,%ymm8,%ymm25 - vpunpckldq %ymm11,%ymm10,%ymm26 - vpunpckhdq %ymm11,%ymm10,%ymm27 - vpunpckldq %ymm13,%ymm12,%ymm28 - vpunpckhdq %ymm13,%ymm12,%ymm29 - vpunpckldq %ymm15,%ymm14,%ymm30 - vpunpckhdq %ymm15,%ymm14,%ymm31 - - # interleave 64-bit words in state n, n+2 - vpunpcklqdq %ymm18,%ymm16,%ymm0 - vpunpcklqdq %ymm19,%ymm17,%ymm1 - vpunpckhqdq %ymm18,%ymm16,%ymm2 - vpunpckhqdq %ymm19,%ymm17,%ymm3 - vpunpcklqdq %ymm22,%ymm20,%ymm4 - vpunpcklqdq %ymm23,%ymm21,%ymm5 - vpunpckhqdq %ymm22,%ymm20,%ymm6 - vpunpckhqdq %ymm23,%ymm21,%ymm7 - vpunpcklqdq %ymm26,%ymm24,%ymm8 - vpunpcklqdq %ymm27,%ymm25,%ymm9 - vpunpckhqdq %ymm26,%ymm24,%ymm10 
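Aside: the vpaddd/vpxord/vprold triples in the .Ldoubleround* loops above are the scalar ChaCha quarter round of RFC 8439 applied to two, four or eight blocks at once; AVX-512VL's vprold performs each rotate in a single instruction instead of the shift+OR or byte-shuffle fallbacks used in the other implementations. A minimal scalar C sketch for reference, not part of the deleted file (rol32 here is a local helper, not the kernel's):

#include <stdint.h>

static inline uint32_t rol32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One ChaCha quarter round; the assembly above runs this on 2/4/8
 * blocks in parallel, one 32-bit state word per vector lane. */
static void chacha_quarter_round(uint32_t *a, uint32_t *b,
				 uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = rol32(*d, 16);
	*c += *d; *b ^= *c; *b = rol32(*b, 12);
	*a += *b; *d ^= *a; *d = rol32(*d, 8);
	*c += *d; *b ^= *c; *b = rol32(*b, 7);
}

A double round applies this once to the four columns and once to the four diagonals of the 4x4 state, which is exactly what each .Ldoubleround* iteration computes before the "sub $2,%r8d".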
- vpunpckhqdq %ymm27,%ymm25,%ymm11 - vpunpcklqdq %ymm30,%ymm28,%ymm12 - vpunpcklqdq %ymm31,%ymm29,%ymm13 - vpunpckhqdq %ymm30,%ymm28,%ymm14 - vpunpckhqdq %ymm31,%ymm29,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa64 %ymm0,%ymm16 - vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 - cmp $0x0020,%rcx - jl .Lxorpart8 - vpxord 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0000(%rsi) - vmovdqa64 %ymm16,%ymm0 - vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 - - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rcx - jl .Lxorpart8 - vpxord 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 - cmp $0x0060,%rcx - jl .Lxorpart8 - vpxord 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rcx - jl .Lxorpart8 - vpxord 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rcx - jl .Lxorpart8 - vpxord 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rcx - jl .Lxorpart8 - vpxord 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 - cmp $0x00e0,%rcx - jl .Lxorpart8 - vpxord 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rcx - jl .Lxorpart8 - vpxord 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa64 %ymm4,%ymm0 - cmp $0x0120,%rcx - jl .Lxorpart8 - vpxord 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0100(%rsi) - - vmovdqa64 %ymm12,%ymm0 - cmp $0x0140,%rcx - jl .Lxorpart8 - vpxord 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0120(%rsi) - - vmovdqa64 %ymm6,%ymm0 - cmp $0x0160,%rcx - jl .Lxorpart8 - vpxord 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0140(%rsi) - - vmovdqa64 %ymm14,%ymm0 - cmp $0x0180,%rcx - jl .Lxorpart8 - vpxord 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0160(%rsi) - - vmovdqa64 %ymm5,%ymm0 - cmp $0x01a0,%rcx - jl .Lxorpart8 - vpxord 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0180(%rsi) - - vmovdqa64 %ymm13,%ymm0 - cmp $0x01c0,%rcx - jl .Lxorpart8 - vpxord 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01a0(%rsi) - - vmovdqa64 %ymm7,%ymm0 - cmp $0x01e0,%rcx - jl .Lxorpart8 - vpxord 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01c0(%rsi) - - vmovdqa64 %ymm15,%ymm0 - cmp $0x0200,%rcx - jl .Lxorpart8 - vpxord 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - RET - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0x1f,%rcx - jz .Ldone8 - mov %rax,%r9 - and $~0x1f,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} - vpxord %ymm0,%ymm1,%ymm1 - vmovdqu8 %ymm1,(%rsi,%r9){%k1} - - jmp .Ldone8 - -SYM_FUNC_END(chacha_8block_xor_avx512vl) diff --git a/arch/x86/lib/crypto/chacha-ssse3-x86_64.S b/arch/x86/lib/crypto/chacha-ssse3-x86_64.S deleted file mode 100644 index 7111949cd5b9..000000000000 --- a/arch/x86/lib/crypto/chacha-ssse3-x86_64.S +++ /dev/null @@ -1,791 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions - * - * Copyright 
(C) 2015 Martin Willi - */ - -#include -#include - -.section .rodata.cst16.ROT8, "aM", @progbits, 16 -.align 16 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 -.section .rodata.cst16.CTRINC, "aM", @progbits, 16 -.align 16 -CTRINC: .octa 0x00000003000000020000000100000000 - -.text - -/* - * chacha_permute - permute one block - * - * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This - * function performs matrix operations on four words in parallel, but requires - * shuffling to rearrange the words after each round. 8/16-bit word rotation is - * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word - * rotation uses traditional shift+OR. - * - * The round count is given in %r8d. - * - * Clobbers: %r8d, %xmm4-%xmm7 - */ -SYM_FUNC_START_LOCAL(chacha_permute) - - movdqa ROT8(%rip),%xmm4 - movdqa ROT16(%rip),%xmm5 - -.Ldoubleround: - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm3,%xmm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm3,%xmm3 - - sub $2,%r8d - jnz .Ldoubleround - - RET -SYM_FUNC_END(chacha_permute) - -SYM_FUNC_START(chacha_block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 1 data block output, o - # %rdx: up to 1 data block input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - FRAME_BEGIN - - # x0..3 = s0..3 - movdqu 0x00(%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqu 0x20(%rdi),%xmm2 - movdqu 0x30(%rdi),%xmm3 - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - movdqa %xmm2,%xmm10 - movdqa %xmm3,%xmm11 - - mov %rcx,%rax - call chacha_permute - - # o0 = i0 ^ (x0 + s0) - paddd %xmm8,%xmm0 - cmp $0x10,%rax - jl .Lxorpart - movdqu 0x00(%rdx),%xmm4 - pxor %xmm4,%xmm0 - movdqu %xmm0,0x00(%rsi) - # o1 = i1 ^ (x1 + s1) - paddd %xmm9,%xmm1 - movdqa %xmm1,%xmm0 - cmp $0x20,%rax - jl .Lxorpart - movdqu 0x10(%rdx),%xmm0 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - # o2 = i2 ^ (x2 + s2) - paddd %xmm10,%xmm2 - movdqa %xmm2,%xmm0 - cmp $0x30,%rax - jl .Lxorpart - movdqu 0x20(%rdx),%xmm0 - pxor %xmm2,%xmm0 - movdqu %xmm0,0x20(%rsi) - # o3 = i3 ^ (x3 + s3) - paddd %xmm11,%xmm3 - movdqa %xmm3,%xmm0 - cmp $0x40,%rax - jl .Lxorpart - 
movdqu 0x30(%rdx),%xmm0 - pxor %xmm3,%xmm0 - movdqu %xmm0,0x30(%rsi) - -.Ldone: - FRAME_END - RET - -.Lxorpart: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone - -SYM_FUNC_END(chacha_block_xor_ssse3) - -SYM_FUNC_START(hchacha_block_ssse3) - # %rdi: Input state matrix, s - # %rsi: output (8 32-bit words) - # %edx: nrounds - FRAME_BEGIN - - movdqu 0x00(%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqu 0x20(%rdi),%xmm2 - movdqu 0x30(%rdi),%xmm3 - - mov %edx,%r8d - call chacha_permute - - movdqu %xmm0,0x00(%rsi) - movdqu %xmm3,0x10(%rsi) - - FRAME_END - RET -SYM_FUNC_END(hchacha_block_ssse3) - -SYM_FUNC_START(chacha_4block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four consecutive ChaCha blocks by loading the - # the state matrix in SSE registers four times. As we need some scratch - # registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32- and then 64-bit words, - # which allows us to do XOR in SSE registers. 8/16-bit word rotation is - # done with the slightly better performing SSSE3 byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. 
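Aside: the rotation trade-off described in the comments above (byte shuffle for 8/16-bit rotates, shift+OR for 7/12-bit rotates) can be sketched with user-space SSE intrinsics. This is illustrative only and not taken from the deleted file; the shuffle control below is the ROT16 constant rewritten as an _mm_set_epi8 initializer:

#include <tmmintrin.h>	/* SSSE3: _mm_shuffle_epi8 (pshufb) */

/* rotl32 by 16 in all four lanes: whole bytes move, so one pshufb suffices */
static __m128i rol32x4_by16(__m128i v)
{
	const __m128i rot16 = _mm_set_epi8(13, 12, 15, 14,  9,  8, 11, 10,
					    5,  4,  7,  6,  1,  0,  3,  2);
	return _mm_shuffle_epi8(v, rot16);
}

/* rotl32 by 12 in all four lanes: not byte-aligned, needs two shifts and an OR */
static __m128i rol32x4_by12(__m128i v)
{
	return _mm_or_si128(_mm_slli_epi32(v, 12), _mm_srli_epi32(v, 20));
}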
- - lea 8(%rsp),%r10 - sub $0x80,%rsp - and $~63,%rsp - mov %rcx,%rax - - # x0..15[0-3] = s0..3[0..3] - movq 0x00(%rdi),%xmm1 - pshufd $0x00,%xmm1,%xmm0 - pshufd $0x55,%xmm1,%xmm1 - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - movq 0x10(%rdi),%xmm5 - pshufd $0x00,%xmm5,%xmm4 - pshufd $0x55,%xmm5,%xmm5 - movq 0x18(%rdi),%xmm7 - pshufd $0x00,%xmm7,%xmm6 - pshufd $0x55,%xmm7,%xmm7 - movq 0x20(%rdi),%xmm9 - pshufd $0x00,%xmm9,%xmm8 - pshufd $0x55,%xmm9,%xmm9 - movq 0x28(%rdi),%xmm11 - pshufd $0x00,%xmm11,%xmm10 - pshufd $0x55,%xmm11,%xmm11 - movq 0x30(%rdi),%xmm13 - pshufd $0x00,%xmm13,%xmm12 - pshufd $0x55,%xmm13,%xmm13 - movq 0x38(%rdi),%xmm15 - pshufd $0x00,%xmm15,%xmm14 - pshufd $0x55,%xmm15,%xmm15 - # x0..3 on stack - movdqa %xmm0,0x00(%rsp) - movdqa %xmm1,0x10(%rsp) - movdqa %xmm2,0x20(%rsp) - movdqa %xmm3,0x30(%rsp) - - movdqa CTRINC(%rip),%xmm1 - movdqa ROT8(%rip),%xmm2 - movdqa ROT16(%rip),%xmm3 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - -.Ldoubleround4: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - 
movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - - sub $2,%r8d - jnz .Ldoubleround4 - - # x0[0-3] += s0[0] - # x1[0-3] += s0[1] - movq 0x00(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x00(%rsp),%xmm2 - movdqa %xmm2,0x00(%rsp) - paddd 0x10(%rsp),%xmm3 - movdqa %xmm3,0x10(%rsp) - # x2[0-3] += s0[2] - # x3[0-3] += s0[3] - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x20(%rsp),%xmm2 - movdqa %xmm2,0x20(%rsp) - paddd 0x30(%rsp),%xmm3 - movdqa %xmm3,0x30(%rsp) - - # x4[0-3] += s1[0] - # x5[0-3] += s1[1] - movq 0x10(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - # x6[0-3] += s1[2] - # x7[0-3] += s1[3] - movq 0x18(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - - # x8[0-3] += s2[0] - # x9[0-3] += s2[1] - movq 0x20(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm8 - paddd %xmm3,%xmm9 - # x10[0-3] += s2[2] - # x11[0-3] += s2[3] - movq 0x28(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 
%xmm2,%xmm10 - paddd %xmm3,%xmm11 - - # x12[0-3] += s3[0] - # x13[0-3] += s3[1] - movq 0x30(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm12 - paddd %xmm3,%xmm13 - # x14[0-3] += s3[2] - # x15[0-3] += s3[3] - movq 0x38(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm14 - paddd %xmm3,%xmm15 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - - # interleave 32-bit words in state n, n+1 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x10(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x10(%rsp) - movdqa 0x20(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x20(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpckldq %xmm5,%xmm4 - punpckhdq %xmm5,%xmm0 - movdqa %xmm0,%xmm5 - movdqa %xmm6,%xmm0 - punpckldq %xmm7,%xmm6 - punpckhdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpckldq %xmm9,%xmm8 - punpckhdq %xmm9,%xmm0 - movdqa %xmm0,%xmm9 - movdqa %xmm10,%xmm0 - punpckldq %xmm11,%xmm10 - punpckhdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpckldq %xmm13,%xmm12 - punpckhdq %xmm13,%xmm0 - movdqa %xmm0,%xmm13 - movdqa %xmm14,%xmm0 - punpckldq %xmm15,%xmm14 - punpckhdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # interleave 64-bit words in state n, n+2 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x20(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x20(%rsp) - movdqa 0x10(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x10(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpcklqdq %xmm6,%xmm4 - punpckhqdq %xmm6,%xmm0 - movdqa %xmm0,%xmm6 - movdqa %xmm5,%xmm0 - punpcklqdq %xmm7,%xmm5 - punpckhqdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpcklqdq %xmm10,%xmm8 - punpckhqdq %xmm10,%xmm0 - movdqa %xmm0,%xmm10 - movdqa %xmm9,%xmm0 - punpcklqdq %xmm11,%xmm9 - punpckhqdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpcklqdq %xmm14,%xmm12 - punpckhqdq %xmm14,%xmm0 - movdqa %xmm0,%xmm14 - movdqa %xmm13,%xmm0 - punpcklqdq %xmm15,%xmm13 - punpckhqdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # xor with corresponding input, write to output - movdqa 0x00(%rsp),%xmm0 - cmp $0x10,%rax - jl .Lxorpart4 - movdqu 0x00(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x00(%rsi) - - movdqu %xmm4,%xmm0 - cmp $0x20,%rax - jl .Lxorpart4 - movdqu 0x10(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - - movdqu %xmm8,%xmm0 - cmp $0x30,%rax - jl .Lxorpart4 - movdqu 0x20(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x20(%rsi) - - movdqu %xmm12,%xmm0 - cmp $0x40,%rax - jl .Lxorpart4 - movdqu 0x30(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x30(%rsi) - - movdqa 0x20(%rsp),%xmm0 - cmp $0x50,%rax - jl .Lxorpart4 - movdqu 0x40(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x40(%rsi) - - movdqu %xmm6,%xmm0 - cmp $0x60,%rax - jl .Lxorpart4 - movdqu 0x50(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x50(%rsi) - - movdqu %xmm10,%xmm0 - cmp $0x70,%rax - jl .Lxorpart4 - movdqu 0x60(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x60(%rsi) - - movdqu %xmm14,%xmm0 - cmp $0x80,%rax - jl .Lxorpart4 - movdqu 0x70(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x70(%rsi) - - movdqa 0x10(%rsp),%xmm0 - cmp $0x90,%rax - jl .Lxorpart4 - movdqu 0x80(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x80(%rsi) - - movdqu 
%xmm5,%xmm0 - cmp $0xa0,%rax - jl .Lxorpart4 - movdqu 0x90(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x90(%rsi) - - movdqu %xmm9,%xmm0 - cmp $0xb0,%rax - jl .Lxorpart4 - movdqu 0xa0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xa0(%rsi) - - movdqu %xmm13,%xmm0 - cmp $0xc0,%rax - jl .Lxorpart4 - movdqu 0xb0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xb0(%rsi) - - movdqa 0x30(%rsp),%xmm0 - cmp $0xd0,%rax - jl .Lxorpart4 - movdqu 0xc0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xc0(%rsi) - - movdqu %xmm7,%xmm0 - cmp $0xe0,%rax - jl .Lxorpart4 - movdqu 0xd0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xd0(%rsi) - - movdqu %xmm11,%xmm0 - cmp $0xf0,%rax - jl .Lxorpart4 - movdqu 0xe0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xe0(%rsi) - - movdqu %xmm15,%xmm0 - cmp $0x100,%rax - jl .Lxorpart4 - movdqu 0xf0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xf0(%rsi) - -.Ldone4: - lea -8(%r10),%rsp - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_ssse3) diff --git a/arch/x86/lib/crypto/chacha_glue.c b/arch/x86/lib/crypto/chacha_glue.c deleted file mode 100644 index 10b2c945f541..000000000000 --- a/arch/x86/lib/crypto/chacha_glue.c +++ /dev/null @@ -1,196 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * ChaCha and HChaCha functions (x86_64 optimized) - * - * Copyright (C) 2015 Martin Willi - */ - -#include -#include -#include -#include -#include -#include - -asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void hchacha_block_ssse3(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); - -asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); - -static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) -{ - len = min(len, maxblocks * CHACHA_BLOCK_SIZE); - return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; -} - -static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (static_branch_likely(&chacha_use_avx512vl)) { - while (bytes >= 
CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state->x[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state->x[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state->x[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_2block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state->x[12] += chacha_advance(bytes, 2); - return; - } - } - - if (static_branch_likely(&chacha_use_avx2)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state->x[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 4); - return; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 2); - return; - } - } - - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state->x[12] += 4; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); - state->x[12]++; - } -} - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd)) { - hchacha_block_generic(state, out, nrounds); - } else { - kernel_fpu_begin(); - hchacha_block_ssse3(state, out, nrounds); - kernel_fpu_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd) || - bytes <= CHACHA_BLOCK_SIZE) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_fpu_begin(); - chacha_dosimd(state, dst, src, todo, nrounds); - kernel_fpu_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&chacha_use_simd); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -static int __init chacha_simd_mod_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return 0; - - static_branch_enable(&chacha_use_simd); - - if (boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - static_branch_enable(&chacha_use_avx2); - - if (boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ - static_branch_enable(&chacha_use_avx512vl); - } - return 0; -} -subsys_initcall(chacha_simd_mod_init); - -static void __exit chacha_simd_mod_exit(void) -{ -} -module_exit(chacha_simd_mod_exit); - -MODULE_LICENSE("GPL"); 
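Aside: chacha_dosimd() above deliberately hands a tail to a wider routine than strictly necessary (for example the 8-block kernel for anything above four blocks), relying on the assembly's partial-length handling, so the block counter in state->x[12] must advance by the number of blocks actually produced, full or partial. A standalone restatement of that arithmetic for illustration, with a hypothetical main(); in the glue code this is chacha_advance() built from min() and round_up():

#include <stdio.h>

#define CHACHA_BLOCK_SIZE 64

/* blocks consumed when an "up to maxblocks" routine is given len bytes */
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
{
	unsigned int cap = maxblocks * CHACHA_BLOCK_SIZE;

	if (len > cap)
		len = cap;
	return (len + CHACHA_BLOCK_SIZE - 1) / CHACHA_BLOCK_SIZE;
}

int main(void)
{
	/* a 200-byte tail fed to the 4-block SSSE3 routine advances x[12] by 4 */
	printf("%u\n", chacha_advance(200, 4));	/* 4 */
	/* a 100-byte tail fed to the 2-block routine advances it by 2 */
	printf("%u\n", chacha_advance(100, 2));	/* 2 */
	return 0;
}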
-MODULE_AUTHOR("Martin Willi "); -MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)"); diff --git a/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl deleted file mode 100644 index 501827254fed..000000000000 --- a/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl +++ /dev/null @@ -1,4253 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -# -# Copyright (C) 2017-2018 Samuel Neves . All Rights Reserved. -# Copyright (C) 2017-2019 Jason A. Donenfeld . All Rights Reserved. -# Copyright (C) 2006-2017 CRYPTOGAMS by . All Rights Reserved. -# -# This code is taken from the OpenSSL project but the author, Andy Polyakov, -# has relicensed it under the licenses specified in the SPDX header above. -# The original headers, including the original license headers, are -# included below for completeness. -# -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements Poly1305 hash for x86_64. -# -# March 2015 -# -# Initial release. -# -# December 2016 -# -# Add AVX512F+VL+BW code path. -# -# November 2017 -# -# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be -# executed even on Knights Landing. Trigger for modification was -# observation that AVX512 code paths can negatively affect overall -# Skylake-X system performance. Since we are likely to suppress -# AVX512F capability flag [at least on Skylake-X], conversion serves -# as kind of "investment protection". Note that next *lake processor, -# Cannonlake, has AVX512IFMA code path to execute... -# -# Numbers are cycles per processed byte with poly1305_blocks alone, -# measured with rdtsc at fixed clock frequency. -# -# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 -# P4 4.46/+120% - -# Core 2 2.41/+90% - -# Westmere 1.88/+120% - -# Sandy Bridge 1.39/+140% 1.10 -# Haswell 1.14/+175% 1.11 0.65 -# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] -# Silvermont 2.83/+95% - -# Knights L 3.60/? 
1.65 1.10 0.41(***) -# Goldmont 1.70/+180% - -# VIA Nano 1.82/+150% - -# Sledgehammer 1.38/+160% - -# Bulldozer 2.30/+130% 0.97 -# Ryzen 1.15/+200% 1.08 1.18 -# -# (*) improvement coefficients relative to clang are more modest and -# are ~50% on most processors, in both cases we are comparing to -# __int128 code; -# (**) SSE2 implementation was attempted, but among non-AVX processors -# it was faster than integer-only code only on older Intel P4 and -# Core processors, 50-30%, less newer processor is, but slower on -# contemporary ones, for example almost 2x slower on Atom, and as -# former are naturally disappearing, SSE2 is deemed unnecessary; -# (***) strangely enough performance seems to vary from core to core, -# listed result is best case; - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); -$kernel=0; $kernel=1 if (!$flavour && !$output); - -if (!$kernel) { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or - die "can't locate x86_64-xlate.pl"; - - open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; - *STDOUT=*OUT; - - if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); - } - - if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { - $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); - $avx += 1 if ($1==2.11 && $2>=8); - } - - if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=11); - } - - if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { - $avx = ($2>=3.0) + ($2>3.0); - } -} else { - $avx = 4; # The kernel uses ifdefs for this. 
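Aside: the AVX code emitted further down keeps the hash in the base 2^26 layout described in the "Layout of opaque area" comments below (five 26-bit limbs, with the key powers interleaved with their multiples of 5), converting to it from the scalar code's 64-bit limbs before entering the vector loops. A standalone C sketch of that limb split, illustrative only and not generated by this script; it assumes h2 holds just the few bits above 2^128, as in the scalar representation:

#include <stdint.h>

/* base 2^64 -> base 2^26: split h = h0 + 2^64*h1 + 2^128*h2 into
 * five 26-bit limbs d[0..4] */
static void poly1305_64_to_26(uint32_t d[5], uint64_t h0, uint64_t h1,
			      uint64_t h2)
{
	d[0] = h0 & 0x3ffffff;				/* bits   0..25  */
	d[1] = (h0 >> 26) & 0x3ffffff;			/* bits  26..51  */
	d[2] = ((h0 >> 52) | (h1 << 12)) & 0x3ffffff;	/* bits  52..77  */
	d[3] = (h1 >> 14) & 0x3ffffff;			/* bits  78..103 */
	d[4] = (uint32_t)((h1 >> 40) | (h2 << 24));	/* bits 104..129 */
}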
-} - -sub declare_function() { - my ($name, $align, $nargs) = @_; - if($kernel) { - $code .= "SYM_FUNC_START($name)\n"; - $code .= ".L$name:\n"; - } else { - $code .= ".globl $name\n"; - $code .= ".type $name,\@function,$nargs\n"; - $code .= ".align $align\n"; - $code .= "$name:\n"; - } -} - -sub declare_typed_function() { - my ($name, $align, $nargs) = @_; - if($kernel) { - $code .= "SYM_TYPED_FUNC_START($name)\n"; - $code .= ".L$name:\n"; - } else { - $code .= ".globl $name\n"; - $code .= ".type $name,\@function,$nargs\n"; - $code .= ".align $align\n"; - $code .= "$name:\n"; - } -} - -sub end_function() { - my ($name) = @_; - if($kernel) { - $code .= "SYM_FUNC_END($name)\n"; - } else { - $code .= ".size $name,.-$name\n"; - } -} - -$code.=<<___ if $kernel; -#include -___ - -if ($avx) { -$code.=<<___ if $kernel; -.section .rodata -___ -$code.=<<___; -.align 64 -.Lconst: -.Lmask24: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.L129: -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 -.Lmask26: -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lpermd_avx2: -.long 2,2,2,3,2,0,2,1 -.Lpermd_avx512: -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 - -.L2_44_inp_permd: -.long 0,1,1,2,2,3,7,7 -.L2_44_inp_shift: -.quad 0,12,24,64 -.L2_44_mask: -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff -.L2_44_shift_rgt: -.quad 44,44,42,64 -.L2_44_shift_lft: -.quad 8,8,10,64 - -.align 64 -.Lx_mask44: -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.Lx_mask42: -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -___ -} -$code.=<<___ if (!$kernel); -.asciz "Poly1305 for x86_64, CRYPTOGAMS by " -.align 16 -___ - -my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); -my ($mac,$nonce)=($inp,$len); # *_emit arguments -my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); -my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); - -sub poly1305_iteration { -# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 -# output: $h0-$h2 *= $r0-$r1 -$code.=<<___; - mulq $h0 # h0*r1 - mov %rax,$d2 - mov $r0,%rax - mov %rdx,$d3 - - mulq $h0 # h0*r0 - mov %rax,$h0 # future $h0 - mov $r0,%rax - mov %rdx,$d1 - - mulq $h1 # h1*r0 - add %rax,$d2 - mov $s1,%rax - adc %rdx,$d3 - - mulq $h1 # h1*s1 - mov $h2,$h1 # borrow $h1 - add %rax,$h0 - adc %rdx,$d1 - - imulq $s1,$h1 # h2*s1 - add $h1,$d2 - mov $d1,$h1 - adc \$0,$d3 - - imulq $r0,$h2 # h2*r0 - add $d2,$h1 - mov \$-4,%rax # mask value - adc $h2,$d3 - - and $d3,%rax # last reduction step - mov $d3,$h2 - shr \$2,$d3 - and \$3,$h2 - add $d3,%rax - add %rax,$h0 - adc \$0,$h1 - adc \$0,$h2 -___ -} - -######################################################################## -# Layout of opaque area is following. 
-# -# unsigned __int64 h[3]; # current hash value base 2^64 -# unsigned __int64 r[2]; # key value base 2^64 - -$code.=<<___; -.text -___ -$code.=<<___ if (!$kernel); -.extern OPENSSL_ia32cap_P - -.globl poly1305_block_init_arch -.hidden poly1305_block_init_arch -.globl poly1305_blocks_x86_64 -.hidden poly1305_blocks_x86_64 -.globl poly1305_emit_x86_64 -.hidden poly1305_emit_x86_64 -___ -&declare_typed_function("poly1305_block_init_arch", 32, 3); -$code.=<<___; - xor %eax,%eax - mov %rax,0($ctx) # initialize hash value - mov %rax,8($ctx) - mov %rax,16($ctx) - - test $inp,$inp - je .Lno_key -___ -$code.=<<___ if (!$kernel); - lea poly1305_blocks_x86_64(%rip),%r10 - lea poly1305_emit_x86_64(%rip),%r11 -___ -$code.=<<___ if (!$kernel && $avx); - mov OPENSSL_ia32cap_P+4(%rip),%r9 - lea poly1305_blocks_avx(%rip),%rax - lea poly1305_emit_avx(%rip),%rcx - bt \$`60-32`,%r9 # AVX? - cmovc %rax,%r10 - cmovc %rcx,%r11 -___ -$code.=<<___ if (!$kernel && $avx>1); - lea poly1305_blocks_avx2(%rip),%rax - bt \$`5+32`,%r9 # AVX2? - cmovc %rax,%r10 -___ -$code.=<<___ if (!$kernel && $avx>3); - mov \$`(1<<31|1<<21|1<<16)`,%rax - shr \$32,%r9 - and %rax,%r9 - cmp %rax,%r9 - je .Linit_base2_44 -___ -$code.=<<___; - mov \$0x0ffffffc0fffffff,%rax - mov \$0x0ffffffc0ffffffc,%rcx - and 0($inp),%rax - and 8($inp),%rcx - mov %rax,24($ctx) - mov %rcx,32($ctx) -___ -$code.=<<___ if (!$kernel && $flavour !~ /elf32/); - mov %r10,0(%rdx) - mov %r11,8(%rdx) -___ -$code.=<<___ if (!$kernel && $flavour =~ /elf32/); - mov %r10d,0(%rdx) - mov %r11d,4(%rdx) -___ -$code.=<<___; - mov \$1,%eax -.Lno_key: - RET -___ -&end_function("poly1305_block_init_arch"); - -&declare_function("poly1305_blocks_x86_64", 32, 4); -$code.=<<___; -.cfi_startproc -.Lblocks: - shr \$4,$len - jz .Lno_data # too short - - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $ctx -.cfi_push $ctx -.Lblocks_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2 - - mov $s1,$r1 - shr \$2,$s1 - mov $r1,%rax - add $r1,$s1 # s1 = r1 + (r1 >> 2) - jmp .Loop - -.align 32 -.Loop: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 -___ - - &poly1305_iteration(); - -$code.=<<___; - mov $r1,%rax - dec %r15 # len-=16 - jnz .Loop - - mov 0(%rsp),$ctx -.cfi_restore $ctx - - mov $h0,0($ctx) # store hash value - mov $h1,8($ctx) - mov $h2,16($ctx) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lno_data: -.Lblocks_epilogue: - RET -.cfi_endproc -___ -&end_function("poly1305_blocks_x86_64"); - -&declare_function("poly1305_emit_x86_64", 32, 3); -$code.=<<___; -.Lemit: - mov 0($ctx),%r8 # load hash value - mov 8($ctx),%r9 - mov 16($ctx),%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -___ -&end_function("poly1305_emit_x86_64"); -if ($avx) { - -######################################################################## -# Layout of opaque area is following. 
-# -# unsigned __int32 h[5]; # current hash value base 2^26 -# unsigned __int32 is_base2_26; -# unsigned __int64 r[2]; # key value base 2^64 -# unsigned __int64 pad; -# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; -# -# where r^n are base 2^26 digits of degrees of multiplier key. There are -# 5 digits, but last four are interleaved with multiples of 5, totalling -# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. - -my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = - map("%xmm$_",(0..15)); - -$code.=<<___; -.type __poly1305_block,\@abi-omnipotent -.align 32 -__poly1305_block: - push $ctx -___ - &poly1305_iteration(); -$code.=<<___; - pop $ctx - RET -.size __poly1305_block,.-__poly1305_block - -.type __poly1305_init_avx,\@abi-omnipotent -.align 32 -__poly1305_init_avx: - push %rbp - mov %rsp,%rbp - mov $r0,$h0 - mov $r1,$h1 - xor $h2,$h2 - - lea 48+64($ctx),$ctx # size optimization - - mov $r1,%rax - call __poly1305_block # r^2 - - mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 - mov \$0x3ffffff,%edx - mov $h0,$d1 - and $h0#d,%eax - mov $r0,$d2 - and $r0#d,%edx - mov %eax,`16*0+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*0+4-64`($ctx) - shr \$26,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*1+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*1+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*2+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*2+4-64`($ctx) - shr \$26,$d2 - - mov $h1,%rax - mov $r1,%rdx - shl \$12,%rax - shl \$12,%rdx - or $d1,%rax - or $d2,%rdx - and \$0x3ffffff,%eax - and \$0x3ffffff,%edx - mov %eax,`16*3+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*3+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*4+0-64`($ctx) - mov $h1,$d1 - mov %edx,`16*4+4-64`($ctx) - mov $r1,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - shr \$14,$d1 - shr \$14,$d2 - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*5+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*5+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*6+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*6+4-64`($ctx) - shr \$26,$d2 - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+0-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d2#d,`16*7+4-64`($ctx) - lea ($d2,$d2,4),$d2 # *5 - mov $d1#d,`16*8+0-64`($ctx) - mov $d2#d,`16*8+4-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^3 - - mov \$0x3ffffff,%eax # save r^3 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+12-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+12-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and \$0x3ffffff,%eax - mov %eax,`16*3+12-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+12-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+12-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+12-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+12-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^4 - - mov \$0x3ffffff,%eax # save r^4 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+8-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+8-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and 
\$0x3ffffff,%eax - mov %eax,`16*3+8-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+8-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+8-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+8-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+8-64`($ctx) - - lea -48-64($ctx),$ctx # size [de-]optimization - pop %rbp - RET -.size __poly1305_init_avx,.-__poly1305_init_avx -___ - -&declare_function("poly1305_blocks_avx", 32, 4); -$code.=<<___; -.cfi_startproc - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx: - and \$-16,$len - jz .Lno_data_avx - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx - - test \$31,$len - jz .Leven_avx - - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lblocks_avx_body: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - - call __poly1305_block - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - sub \$16,%r15 - jz .Lstore_base2_26_avx - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - jmp .Lproceed_avx - -.align 32 -.Lstore_base2_64_avx: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx - -.align 16 -.Lstore_base2_26_avx: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx: - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lno_data_avx: -.Lblocks_avx_epilogue: - RET -.cfi_endproc - -.align 32 -.Lbase2_64_avx: -.cfi_startproc - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lbase2_64_avx_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - 
mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$31,$len - jz .Linit_avx - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - -.Linit_avx: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - movl \$1,20($ctx) # set is_base2_26 - - call __poly1305_init_avx - -.Lproceed_avx: - mov %r15,$len - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lbase2_64_avx_epilogue: - jmp .Ldo_avx -.cfi_endproc - -.align 32 -.Leven_avx: -.cfi_startproc - vmovd 4*0($ctx),$H0 # load hash value - vmovd 4*1($ctx),$H1 - vmovd 4*2($ctx),$H2 - vmovd 4*3($ctx),$H3 - vmovd 4*4($ctx),$H4 - -.Ldo_avx: -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - and \$-32,%rsp - sub \$-8,%rsp - lea -0x58(%rsp),%r11 - sub \$0x178,%rsp -___ -$code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 - sub \$0x218,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) -.Ldo_avx_body: -___ -$code.=<<___; - sub \$64,$len - lea -32($inp),%rax - cmovc %rax,$inp - - vmovdqu `16*3`($ctx),$D4 # preload r0^2 - lea `16*3+64`($ctx),$ctx # size optimization - lea .Lconst(%rip),%rcx - - ################################################################ - # load input - vmovdqu 16*2($inp),$T0 - vmovdqu 16*3($inp),$T1 - vmovdqa 64(%rcx),$MASK # .Lmask26 - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - vpsrlq \$40,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - jbe .Lskip_loop_avx - - # expand and copy pre-calculated table to stack - vmovdqu `16*1-64`($ctx),$D1 - vmovdqu `16*2-64`($ctx),$D2 - vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 - vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 - vmovdqa $D3,-0x90(%r11) - vmovdqa $D0,0x00(%rsp) - vpshufd \$0xEE,$D1,$D4 - vmovdqu `16*3-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x80(%r11) - vmovdqa $D1,0x10(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqu `16*4-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x70(%r11) - vmovdqa $D2,0x20(%rsp) - vpshufd \$0xEE,$D0,$D4 - vmovdqu `16*5-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa $D4,-0x60(%r11) - vmovdqa $D0,0x30(%rsp) - vpshufd \$0xEE,$D1,$D3 - vmovdqu `16*6-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D3,-0x50(%r11) - vmovdqa $D1,0x40(%rsp) - vpshufd \$0xEE,$D2,$D4 - vmovdqu `16*7-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D4,-0x40(%r11) - vmovdqa $D2,0x50(%rsp) - vpshufd \$0xEE,$D0,$D3 - vmovdqu `16*8-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa 
$D3,-0x30(%r11) - vmovdqa $D0,0x60(%rsp) - vpshufd \$0xEE,$D1,$D4 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x20(%r11) - vmovdqa $D1,0x70(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x10(%r11) - vmovdqa $D2,0x80(%rsp) - - jmp .Loop_avx - -.align 32 -.Loop_avx: - ################################################################ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - # \___________________/ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - # \___________________/ \____________________/ - # - # Note that we start with inp[2:3]*r^2. This is because it - # doesn't depend on reduction in previous iteration. - ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # though note that $Tx and $Hx are "reversed" in this section, - # and $D4 is preloaded with r0^2... - - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vmovdqa $H2,0x20(%r11) # offload hash - vpmuludq $T2,$D4,$D2 # d3 = h2*r0 - vmovdqa 0x10(%rsp),$H2 # r1^2 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vmovdqa $H0,0x00(%r11) # - vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 - vmovdqa $H1,0x10(%r11) # - vpmuludq $T3,$H2,$H1 # h3*r1 - vpaddq $H0,$D0,$D0 # d0 += h4*s1 - vpaddq $H1,$D4,$D4 # d4 += h3*r1 - vmovdqa $H3,0x30(%r11) # - vpmuludq $T2,$H2,$H0 # h2*r1 - vpmuludq $T1,$H2,$H1 # h1*r1 - vpaddq $H0,$D3,$D3 # d3 += h2*r1 - vmovdqa 0x30(%rsp),$H3 # r2^2 - vpaddq $H1,$D2,$D2 # d2 += h1*r1 - vmovdqa $H4,0x40(%r11) # - vpmuludq $T0,$H2,$H2 # h0*r1 - vpmuludq $T2,$H3,$H0 # h2*r2 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - - vmovdqa 0x40(%rsp),$H4 # s2^2 - vpaddq $H0,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H3,$H1 # h1*r2 - vpmuludq $T0,$H3,$H3 # h0*r2 - vpaddq $H1,$D3,$D3 # d3 += h1*r2 - vmovdqa 0x50(%rsp),$H2 # r3^2 - vpaddq $H3,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H4,$H0 # h4*s2 - vpmuludq $T3,$H4,$H4 # h3*s2 - vpaddq $H0,$D1,$D1 # d1 += h4*s2 - vmovdqa 0x60(%rsp),$H3 # s3^2 - vpaddq $H4,$D0,$D0 # d0 += h3*s2 - - vmovdqa 0x80(%rsp),$H4 # s4^2 - vpmuludq $T1,$H2,$H1 # h1*r3 - vpmuludq $T0,$H2,$H2 # h0*r3 - vpaddq $H1,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $T4,$H3,$H0 # h4*s3 - vpmuludq $T3,$H3,$H1 # h3*s3 - vpaddq $H0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*0($inp),$H0 # load input - vpaddq $H1,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H3,$H3 # h2*s3 - vpmuludq $T2,$H4,$T2 # h2*s4 - vpaddq $H3,$D0,$D0 # d0 += h2*s3 - - vmovdqu 16*1($inp),$H1 # - vpaddq $T2,$D1,$D1 # d1 += h2*s4 - vpmuludq $T3,$H4,$T3 # h3*s4 - vpmuludq $T4,$H4,$T4 # h4*s4 - vpsrldq \$6,$H0,$H2 # splat input - vpaddq $T3,$D2,$D2 # d2 += h3*s4 - vpaddq $T4,$D3,$D3 # d3 += h4*s4 - vpsrldq \$6,$H1,$H3 # - vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 - vpmuludq $T1,$H4,$T0 # h1*s4 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpaddq $T4,$D4,$D4 # d4 += h0*r4 - vmovdqa -0x90(%r11),$T4 # r0^4 - vpaddq $T0,$D0,$D0 # d0 += h1*s4 - - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - #vpsrlq \$40,$H4,$H4 # 4 - vpsrldq \$`40/8`,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpand 0(%rcx),$H4,$H4 # .Lmask24 - 
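Each vpmuludq/vpaddq group in the loop above computes the five limb sums listed in the d0..d4 comments, one 26-bit limb per vector lane. The same computation in scalar C, as a sketch with illustrative names (s[i] = 5*r[i] is precomputed because limb products that cross 2^130 wrap around with a factor of 5; every 32x32 product and five-term sum fits comfortably in 64 bits):

#include <stdint.h>

static void poly1305_mul_base2_26(const uint32_t h[5], const uint32_t r[5],
				  const uint32_t s[5], /* s[i] = 5*r[i], s[0] unused */
				  uint64_t d[5])
{
	d[0] = (uint64_t)h[0]*r[0] + (uint64_t)h[4]*s[1] + (uint64_t)h[3]*s[2] +
	       (uint64_t)h[2]*s[3] + (uint64_t)h[1]*s[4];
	d[1] = (uint64_t)h[1]*r[0] + (uint64_t)h[0]*r[1] + (uint64_t)h[4]*s[2] +
	       (uint64_t)h[3]*s[3] + (uint64_t)h[2]*s[4];
	d[2] = (uint64_t)h[2]*r[0] + (uint64_t)h[1]*r[1] + (uint64_t)h[0]*r[2] +
	       (uint64_t)h[4]*s[3] + (uint64_t)h[3]*s[4];
	d[3] = (uint64_t)h[3]*r[0] + (uint64_t)h[2]*r[1] + (uint64_t)h[1]*r[2] +
	       (uint64_t)h[0]*r[3] + (uint64_t)h[4]*s[4];
	d[4] = (uint64_t)h[4]*r[0] + (uint64_t)h[3]*r[1] + (uint64_t)h[2]*r[2] +
	       (uint64_t)h[1]*r[3] + (uint64_t)h[0]*r[4];
}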
vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpaddq 0x00(%r11),$H0,$H0 # add hash value - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - lea 16*2($inp),%rax - lea 16*4($inp),$inp - sub \$64,$len - cmovc %rax,$inp - - ################################################################ - # Now we accumulate (inp[0:1]+hash)*r^4 - ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $H0,$T4,$T0 # h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vmovdqa -0x80(%r11),$T2 # r1^4 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T0,$D2,$D2 - vpaddq $T1,$D3,$D3 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 - vpaddq $T4,$D4,$D4 - - vpaddq $T0,$D0,$D0 # d0 += h4*s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vmovdqa -0x60(%r11),$T3 # r2^4 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpmuludq $H1,$T2,$T1 # h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T1,$D2,$D2 # d2 += h1*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - - vmovdqa -0x50(%r11),$T4 # s2^4 - vpmuludq $H2,$T3,$T0 # h2*r2 - vpmuludq $H1,$T3,$T1 # h1*r2 - vpaddq $T0,$D4,$D4 # d4 += h2*r2 - vpaddq $T1,$D3,$D3 # d3 += h1*r2 - vmovdqa -0x40(%r11),$T2 # r3^4 - vpmuludq $H0,$T3,$T3 # h0*r2 - vpmuludq $H4,$T4,$T0 # h4*s2 - vpaddq $T3,$D2,$D2 # d2 += h0*r2 - vpaddq $T0,$D1,$D1 # d1 += h4*s2 - vmovdqa -0x30(%r11),$T3 # s3^4 - vpmuludq $H3,$T4,$T4 # h3*s2 - vpmuludq $H1,$T2,$T1 # h1*r3 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - - vmovdqa -0x10(%r11),$T4 # s4^4 - vpaddq $T1,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T2,$T2 # h0*r3 - vpmuludq $H4,$T3,$T0 # h4*s3 - vpaddq $T2,$D3,$D3 # d3 += h0*r3 - vpaddq $T0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*2($inp),$T0 # load input - vpmuludq $H3,$T3,$T2 # h3*s3 - vpmuludq $H2,$T3,$T3 # h2*s3 - vpaddq $T2,$D1,$D1 # d1 += h3*s3 - vmovdqu 16*3($inp),$T1 # - vpaddq $T3,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H2,$T4,$H2 # h2*s4 - vpmuludq $H3,$T4,$H3 # h3*s4 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $H2,$D1,$D1 # d1 += h2*s4 - vpmuludq $H4,$T4,$H4 # h4*s4 - vpsrldq \$6,$T1,$T3 # - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 - vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 - vpmuludq $H1,$T4,$H0 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - #vpsrlq \$40,$T4,$T4 # 4 - vpsrldq \$`40/8`,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpand 0(%rcx),$T4,$T4 # .Lmask24 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - ################################################################ - # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - # and P. 
Schwabe - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D0 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D0,$H0,$H0 - vpsllq \$2,$D0,$D0 - vpaddq $D0,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - ja .Loop_avx - -.Lskip_loop_avx: - ################################################################ - # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 - add \$32,$len - jnz .Long_tail_avx - - vpaddq $H2,$T2,$T2 - vpaddq $H0,$T0,$T0 - vpaddq $H1,$T1,$T1 - vpaddq $H3,$T3,$T3 - vpaddq $H4,$T4,$T4 - -.Long_tail_avx: - vmovdqa $H2,0x20(%r11) - vmovdqa $H0,0x00(%r11) - vmovdqa $H1,0x10(%r11) - vmovdqa $H3,0x30(%r11) - vmovdqa $H4,0x40(%r11) - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $T2,$D4,$D2 # d2 = h2*r0 - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vpmuludq $T3,$H2,$H0 # h3*r1 - vpaddq $H0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n - vpmuludq $T2,$H2,$H1 # h2*r1 - vpaddq $H1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n - vpmuludq $T1,$H2,$H0 # h1*r1 - vpaddq $H0,$D2,$D2 # d2 += h1*r1 - vpmuludq $T0,$H2,$H2 # h0*r1 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - vpmuludq $T4,$H3,$H3 # h4*s1 - vpaddq $H3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n - vpmuludq $T2,$H4,$H1 # h2*r2 - vpaddq $H1,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H4,$H0 # h1*r2 - vpaddq $H0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n - vpmuludq $T0,$H4,$H4 # h0*r2 - vpaddq $H4,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H2,$H1 # h4*s2 - vpaddq $H1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n - vpmuludq $T3,$H2,$H2 # h3*s2 - vpaddq $H2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $T1,$H3,$H0 # h1*r3 - vpaddq $H0,$D4,$D4 # d4 += h1*r3 - vpmuludq $T0,$H3,$H3 # h0*r3 - vpaddq $H3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n - vpmuludq $T4,$H4,$H1 # h4*s3 - vpaddq $H1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n - vpmuludq $T3,$H4,$H0 # h3*s3 - vpaddq $H0,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H4,$H4 # h2*s3 - vpaddq $H4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $T0,$H2,$H2 # h0*r4 - vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 - vpmuludq $T4,$H3,$H1 # h4*s4 - vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 - vpmuludq $T3,$H3,$H0 # h3*s4 - vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 - vpmuludq $T2,$H3,$H1 # h2*s4 - vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 - vpmuludq $T1,$H3,$H3 # h1*s4 - vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 - - jz .Lshort_tail_avx - - vmovdqu 16*0($inp),$H0 # load input - vmovdqu 16*1($inp),$H1 - - vpsrldq \$6,$H0,$H2 # splat input - vpsrldq \$6,$H1,$H3 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - vpsrlq \$40,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - 
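The carry chain above is the "lazy" reduction from the paper cited in the comment: each limb is only brought back near 26 bits, and the carry out of the top limb re-enters the bottom limb multiplied by 5. A scalar C sketch (illustrative names; the assembly interleaves the independent steps to hide latency):

#include <stdint.h>

static void poly1305_lazy_reduce(uint64_t d[5])
{
	const uint64_t M = 0x3ffffff;
	uint64_t c;

	c = d[3] >> 26; d[3] &= M; d[4] += c;		/* h3 -> h4 */
	c = d[0] >> 26; d[0] &= M; d[1] += c;		/* h0 -> h1 */
	c = d[4] >> 26; d[4] &= M; d[0] += c * 5;	/* h4 -> h0, 2^130 == 5 */
	c = d[1] >> 26; d[1] &= M; d[2] += c;		/* h1 -> h2 */
	c = d[2] >> 26; d[2] &= M; d[3] += c;		/* h2 -> h3 */
	c = d[0] >> 26; d[0] &= M; d[1] += c;		/* h0 -> h1 */
	c = d[3] >> 26; d[3] &= M; d[4] += c;		/* h3 -> h4 */
}

No full reduction mod 2^130 - 5 is attempted here; the limbs merely stay small enough for the next round of 32x32->64 multiplications.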
vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 - vpaddq 0x00(%r11),$H0,$H0 - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - ################################################################ - # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate - - vpmuludq $H0,$T4,$T0 # h0*r0 - vpaddq $T0,$D0,$D0 # d0 += h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T1,$D1,$D1 # d1 += h1*r0 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpaddq $T0,$D2,$D2 # d2 += h2*r0 - vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T1,$D3,$D3 # d3 += h3*r0 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpaddq $T4,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 - vpmuludq $H1,$T2,$T0 # h1*r1 - vpaddq $T0,$D2,$D2 # d2 += h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - vpmuludq $H4,$T3,$T3 # h4*s1 - vpaddq $T3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 - vpmuludq $H2,$T4,$T1 # h2*r2 - vpaddq $T1,$D4,$D4 # d4 += h2*r2 - vpmuludq $H1,$T4,$T0 # h1*r2 - vpaddq $T0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 - vpmuludq $H0,$T4,$T4 # h0*r2 - vpaddq $T4,$D2,$D2 # d2 += h0*r2 - vpmuludq $H4,$T2,$T1 # h4*s2 - vpaddq $T1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 - vpmuludq $H3,$T2,$T2 # h3*s2 - vpaddq $T2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $H1,$T3,$T0 # h1*r3 - vpaddq $T0,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T3,$T3 # h0*r3 - vpaddq $T3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 - vpmuludq $H4,$T4,$T1 # h4*s3 - vpaddq $T1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 - vpmuludq $H3,$T4,$T0 # h3*s3 - vpaddq $T0,$D1,$D1 # d1 += h3*s3 - vpmuludq $H2,$T4,$T4 # h2*s3 - vpaddq $T4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H0,$T2,$T2 # h0*r4 - vpaddq $T2,$D4,$D4 # d4 += h0*r4 - vpmuludq $H4,$T3,$T1 # h4*s4 - vpaddq $T1,$D3,$D3 # d3 += h4*s4 - vpmuludq $H3,$T3,$T0 # h3*s4 - vpaddq $T0,$D2,$D2 # d2 += h3*s4 - vpmuludq $H2,$T3,$T1 # h2*s4 - vpaddq $T1,$D1,$D1 # d1 += h2*s4 - vpmuludq $H1,$T3,$T3 # h1*s4 - vpaddq $T3,$D0,$D0 # d0 += h1*s4 - -.Lshort_tail_avx: - ################################################################ - # horizontal addition - - vpsrldq \$8,$D4,$T4 - vpsrldq \$8,$D3,$T3 - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$D0,$T0 - vpsrldq \$8,$D2,$T2 - vpaddq $T3,$D3,$D3 - vpaddq $T4,$D4,$D4 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$D2,$D2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D4,$H4 - vpand $MASK,$D4,$D4 - - vpsrlq \$26,$D1,$H1 - vpand $MASK,$D1,$D1 - vpaddq $H1,$D2,$D2 # h1 -> h2 - - vpaddq $H4,$D0,$D0 - vpsllq \$2,$H4,$H4 - vpaddq $H4,$D0,$D0 # h4 -> h0 - - vpsrlq \$26,$D2,$H2 - vpand $MASK,$D2,$D2 - vpaddq $H2,$D3,$D3 # h2 -> h3 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vmovd $D0,`4*0-48-64`($ctx) # save partially reduced - 
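The loop/tail split above (and its wider AVX2/AVX-512 counterparts below) works because the serial Horner evaluation can be regrouped so that independent lanes are multiplied by fixed powers of r and only summed at the end, which is what the "horizontal addition" before the store does. For the two-lane case and four blocks m1..m4, for example:

   ((((h + m1)*r + m2)*r + m3)*r + m4)*r
       = ((h + m1)*r^2 + m3)*r^2 + (m2*r^2 + m4)*r      (mod 2^130 - 5)

so the blocks can be distributed over lanes, each lane evaluated with a stride of r^2 (r^4 or r^8 for the wider paths), and the lanes recombined at the end by one multiplication by their respective power of r followed by the horizontal sum.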
vmovd $D1,`4*1-48-64`($ctx) - vmovd $D2,`4*2-48-64`($ctx) - vmovd $D3,`4*3-48-64`($ctx) - vmovd $D4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa 0x50(%r11),%xmm6 - vmovdqa 0x60(%r11),%xmm7 - vmovdqa 0x70(%r11),%xmm8 - vmovdqa 0x80(%r11),%xmm9 - vmovdqa 0x90(%r11),%xmm10 - vmovdqa 0xa0(%r11),%xmm11 - vmovdqa 0xb0(%r11),%xmm12 - vmovdqa 0xc0(%r11),%xmm13 - vmovdqa 0xd0(%r11),%xmm14 - vmovdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp -.Ldo_avx_epilogue: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - vzeroupper - RET -.cfi_endproc -___ -&end_function("poly1305_blocks_avx"); - -&declare_function("poly1305_emit_avx", 32, 3); -$code.=<<___; - cmpl \$0,20($ctx) # is_base2_26? - je .Lemit - - mov 0($ctx),%eax # load hash value base 2^26 - mov 4($ctx),%ecx - mov 8($ctx),%r8d - mov 12($ctx),%r11d - mov 16($ctx),%r10d - - shl \$26,%rcx # base 2^26 -> base 2^64 - mov %r8,%r9 - shl \$52,%r8 - add %rcx,%rax - shr \$12,%r9 - add %rax,%r8 # h0 - adc \$0,%r9 - - shl \$14,%r11 - mov %r10,%rax - shr \$24,%r10 - add %r11,%r9 - shl \$40,%rax - add %rax,%r9 # h1 - adc \$0,%r10 # h2 - - mov %r10,%rax # could be partially reduced, so reduce - mov %r10,%rcx - and \$3,%r10 - shr \$2,%rax - and \$-4,%rcx - add %rcx,%rax - add %rax,%r8 - adc \$0,%r9 - adc \$0,%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -___ -&end_function("poly1305_emit_avx"); - -if ($avx>1) { - -my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = - map("%ymm$_",(0..15)); -my $S4=$MASK; - -sub poly1305_blocks_avxN { - my ($avx512) = @_; - my $suffix = $avx512 ? "_avx512" : ""; -$code.=<<___; -.cfi_startproc - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx2$suffix - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx2$suffix: - and \$-16,$len - jz .Lno_data_avx2$suffix - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx2$suffix - - test \$63,$len - jz .Leven_avx2$suffix - - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lblocks_avx2_body$suffix: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... 
so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - -.Lbase2_26_pre_avx2$suffix: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_26_pre_avx2$suffix - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - test %r15,%r15 - jz .Lstore_base2_26_avx2$suffix - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - jmp .Lproceed_avx2$suffix - -.align 32 -.Lstore_base2_64_avx2$suffix: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx2$suffix - -.align 16 -.Lstore_base2_26_avx2$suffix: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx2$suffix: - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lno_data_avx2$suffix: -.Lblocks_avx2_epilogue$suffix: - RET -.cfi_endproc - -.align 32 -.Lbase2_64_avx2$suffix: -.cfi_startproc - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lbase2_64_avx2_body$suffix: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$63,$len - jz .Linit_avx2$suffix - -.Lbase2_64_pre_avx2$suffix: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_64_pre_avx2$suffix - -.Linit_avx2$suffix: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - movl \$1,20($ctx) # set is_base2_26 - - call __poly1305_init_avx - -.Lproceed_avx2$suffix: - mov %r15,$len # restore $len -___ -$code.=<<___ if (!$kernel); - mov OPENSSL_ia32cap_P+8(%rip),%r9d - mov \$`(1<<31|1<<30|1<<16)`,%r11d -___ -$code.=<<___; - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lbase2_64_avx2_epilogue$suffix: - jmp .Ldo_avx2$suffix -.cfi_endproc - -.align 32 -.Leven_avx2$suffix: 
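For reference, the tail of poly1305_emit_avx above ("# compare to modulus", "# did 130-bit value overflow?") is the only full reduction in the whole computation. A C sketch of that step plus the final nonce addition, with illustrative names, assuming a little-endian host as on x86 (the assembly selects with cmov to stay constant-time; the branch here is only for readability):

#include <stdint.h>
#include <string.h>

static void poly1305_emit_sketch(uint64_t h0, uint64_t h1, uint64_t h2,
				 const uint8_t nonce[16], uint8_t mac[16])
{
	uint64_t g0, g1, g2, n0, n1, c;

	/* fold anything at or above 2^130 back in: 2^130 == 5 (mod p) */
	c = (h2 >> 2) * 5;
	h2 &= 3;
	h0 += c;
	if (h0 < c && ++h1 == 0)
		h2++;

	/* g = h + 5; if g reaches 2^130 then h >= 2^130 - 5, and g mod 2^128
	 * is already the reduced value, so select it instead of h */
	g0 = h0 + 5;
	g1 = h1 + (g0 < h0);
	g2 = h2 + (g1 < h1);
	if (g2 & 4) {
		h0 = g0;
		h1 = g1;
	}

	/* tag = (h + nonce) mod 2^128, little-endian */
	memcpy(&n0, nonce, 8);
	memcpy(&n1, nonce + 8, 8);
	h0 += n0;
	h1 += n1 + (h0 < n0);
	memcpy(mac, &h0, 8);
	memcpy(mac + 8, &h1, 8);
}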
-.cfi_startproc -___ -$code.=<<___ if (!$kernel); - mov OPENSSL_ia32cap_P+8(%rip),%r9d -___ -$code.=<<___; - vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 - vmovd 4*1($ctx),%x#$H1 - vmovd 4*2($ctx),%x#$H2 - vmovd 4*3($ctx),%x#$H3 - vmovd 4*4($ctx),%x#$H4 - -.Ldo_avx2$suffix: -___ -$code.=<<___ if (!$kernel && $avx>2); - cmp \$512,$len - jb .Lskip_avx512 - and %r11d,%r9d - test \$`1<<16`,%r9d # check for AVX512F - jnz .Lblocks_avx512 -.Lskip_avx512$suffix: -___ -$code.=<<___ if ($avx > 2 && $avx512 && $kernel); - cmp \$512,$len - jae .Lblocks_avx512 -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - sub \$0x128,%rsp -___ -$code.=<<___ if ($win64); - lea 8(%rsp),%r10 - sub \$0x1c8,%rsp - vmovdqa %xmm6,-0xb0(%r10) - vmovdqa %xmm7,-0xa0(%r10) - vmovdqa %xmm8,-0x90(%r10) - vmovdqa %xmm9,-0x80(%r10) - vmovdqa %xmm10,-0x70(%r10) - vmovdqa %xmm11,-0x60(%r10) - vmovdqa %xmm12,-0x50(%r10) - vmovdqa %xmm13,-0x40(%r10) - vmovdqa %xmm14,-0x30(%r10) - vmovdqa %xmm15,-0x20(%r10) -.Ldo_avx2_body$suffix: -___ -$code.=<<___; - lea .Lconst(%rip),%rcx - lea 48+64($ctx),$ctx # size optimization - vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 - - # expand and copy pre-calculated table to stack - vmovdqu `16*0-64`($ctx),%x#$T2 - and \$-512,%rsp - vmovdqu `16*1-64`($ctx),%x#$T3 - vmovdqu `16*2-64`($ctx),%x#$T4 - vmovdqu `16*3-64`($ctx),%x#$D0 - vmovdqu `16*4-64`($ctx),%x#$D1 - vmovdqu `16*5-64`($ctx),%x#$D2 - lea 0x90(%rsp),%rax # size optimization - vmovdqu `16*6-64`($ctx),%x#$D3 - vpermd $T2,$T0,$T2 # 00003412 -> 14243444 - vmovdqu `16*7-64`($ctx),%x#$D4 - vpermd $T3,$T0,$T3 - vmovdqu `16*8-64`($ctx),%x#$MASK - vpermd $T4,$T0,$T4 - vmovdqa $T2,0x00(%rsp) - vpermd $D0,$T0,$D0 - vmovdqa $T3,0x20-0x90(%rax) - vpermd $D1,$T0,$D1 - vmovdqa $T4,0x40-0x90(%rax) - vpermd $D2,$T0,$D2 - vmovdqa $D0,0x60-0x90(%rax) - vpermd $D3,$T0,$D3 - vmovdqa $D1,0x80-0x90(%rax) - vpermd $D4,$T0,$D4 - vmovdqa $D2,0xa0-0x90(%rax) - vpermd $MASK,$T0,$MASK - vmovdqa $D3,0xc0-0x90(%rax) - vmovdqa $D4,0xe0-0x90(%rax) - vmovdqa $MASK,0x100-0x90(%rax) - vmovdqa 64(%rcx),$MASK # .Lmask26 - - ################################################################ - # load input - vmovdqu 16*0($inp),%x#$T0 - vmovdqu 16*1($inp),%x#$T1 - vinserti128 \$1,16*2($inp),$T0,$T0 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T3,$T2,$T2 # 2:3 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - - vpsrlq \$30,$T2,$T3 - vpsrlq \$4,$T2,$T2 - vpsrlq \$26,$T0,$T1 - vpsrlq \$40,$T4,$T4 # 4 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - vpaddq $H2,$T2,$H2 # accumulate input - sub \$64,$len - jz .Ltail_avx2$suffix - jmp .Loop_avx2$suffix - -.align 32 -.Loop_avx2$suffix: - ################################################################ - # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 - # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 - # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 - # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 - # \________/\__________/ - ################################################################ - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqa `32*0`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqa `32*1`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqa `32*3`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 - vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # 
d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # however, as h2 is "chronologically" first one available pull - # corresponding operations up, so it's - # - # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 - # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 - # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1 - vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - vmovdqa `32*4-0x90`(%rax),$T1 # s2 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vmovdqu 16*0($inp),%x#$T0 # load input - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - vinserti128 \$1,16*2($inp),$T0,$T0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vmovdqu 16*1($inp),%x#$T1 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqa `32*5-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpsrldq \$6,$T1,$T3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - vpunpckhqdq $T1,$T0,$T4 # 4 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # lazy reduction (interleaved with tail of input splat) - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$4,$T3,$T2 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpand $MASK,$T2,$T2 # 2 - vpsrlq \$26,$T0,$T1 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpaddq $T2,$H2,$H2 # modulo-scheduled - vpsrlq \$30,$T3,$T3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$40,$T4,$T4 # 4 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 
32(%rcx),$T4,$T4 # padbit, yes, always - - sub \$64,$len - jnz .Loop_avx2$suffix - - .byte 0x66,0x90 -.Ltail_avx2$suffix: - ################################################################ - # while above multiplications were by r^4 in all lanes, in last - # iteration we multiply least significant lane by r^4 and most - # significant one by r, so copy of above except that references - # to the precomputed table are displaced by 4... - - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqu `32*0+4`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqu `32*1+4`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqu `32*3+4`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 - vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1 - vpmuludq $H1,$T1,$H2 # h1*r1 - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # horizontal addition - - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$H2,$T2 - vpsrldq \$8,$H3,$T3 - vpsrldq \$8,$H4,$T4 - vpsrldq \$8,$H0,$T0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - - vpermq \$0x2,$H3,$T3 - vpermq \$0x2,$H4,$T4 - vpermq \$0x2,$H0,$T0 - vpermq \$0x2,$D1,$T1 - vpermq \$0x2,$H2,$T2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 
- - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced - vmovd %x#$H1,`4*1-48-64`($ctx) - vmovd %x#$H2,`4*2-48-64`($ctx) - vmovd %x#$H3,`4*3-48-64`($ctx) - vmovd %x#$H4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa -0xb0(%r10),%xmm6 - vmovdqa -0xa0(%r10),%xmm7 - vmovdqa -0x90(%r10),%xmm8 - vmovdqa -0x80(%r10),%xmm9 - vmovdqa -0x70(%r10),%xmm10 - vmovdqa -0x60(%r10),%xmm11 - vmovdqa -0x50(%r10),%xmm12 - vmovdqa -0x40(%r10),%xmm13 - vmovdqa -0x30(%r10),%xmm14 - vmovdqa -0x20(%r10),%xmm15 - lea -8(%r10),%rsp -.Ldo_avx2_epilogue$suffix: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - vzeroupper - RET -.cfi_endproc -___ -if($avx > 2 && $avx512) { -my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); -my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); -my $PADBIT="%zmm30"; - -map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain -map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); -map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); -map(s/%y/%z/,($MASK)); - -$code.=<<___; -.cfi_startproc -.Lblocks_avx512: - mov \$15,%eax - kmovw %eax,%k2 -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - sub \$0x128,%rsp -___ -$code.=<<___ if ($win64); - lea 8(%rsp),%r10 - sub \$0x1c8,%rsp - vmovdqa %xmm6,-0xb0(%r10) - vmovdqa %xmm7,-0xa0(%r10) - vmovdqa %xmm8,-0x90(%r10) - vmovdqa %xmm9,-0x80(%r10) - vmovdqa %xmm10,-0x70(%r10) - vmovdqa %xmm11,-0x60(%r10) - vmovdqa %xmm12,-0x50(%r10) - vmovdqa %xmm13,-0x40(%r10) - vmovdqa %xmm14,-0x30(%r10) - vmovdqa %xmm15,-0x20(%r10) -.Ldo_avx512_body: -___ -$code.=<<___; - lea .Lconst(%rip),%rcx - lea 48+64($ctx),$ctx # size optimization - vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 - - # expand pre-calculated table - vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} - and \$-512,%rsp - vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} - mov \$0x20,%rax - vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} - vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} - vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} - vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} - vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} - vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} - vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} - vpermd $D0,$T2,$R0 # 00003412 -> 14243444 - vpbroadcastq 64(%rcx),$MASK # .Lmask26 - vpermd $D1,$T2,$R1 - vpermd $T0,$T2,$S1 - vpermd $D2,$T2,$R2 - vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 - vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 - vpermd $T1,$T2,$S2 - vmovdqu64 $R1,0x00(%rsp,%rax){%k2} - vpsrlq \$32,$R1,$T1 - vpermd $D3,$T2,$R3 - vmovdqa64 $S1,0x40(%rsp){%k2} - vpermd $T3,$T2,$S3 - vpermd $D4,$T2,$R4 - vmovdqu64 $R2,0x40(%rsp,%rax){%k2} - vpermd $T4,$T2,$S4 - vmovdqa64 $S2,0x80(%rsp){%k2} - vmovdqu64 $R3,0x80(%rsp,%rax){%k2} - vmovdqa64 $S3,0xc0(%rsp){%k2} - vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} - vmovdqa64 $S4,0x100(%rsp){%k2} - - ################################################################ - # calculate 5th through 8th powers of the key - # - # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 - # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 - # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 - # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 - # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 - - vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 - vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 - vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 - vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 - vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 - vpsrlq \$32,$R2,$T2 - - vpmuludq $T1,$S4,$M0 - vpmuludq $T1,$R0,$M1 - vpmuludq $T1,$R1,$M2 - vpmuludq $T1,$R2,$M3 - vpmuludq $T1,$R3,$M4 - vpsrlq \$32,$R3,$T3 - vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 - vpaddq $M1,$D1,$D1 # d1 += r1'*r0 - vpaddq $M2,$D2,$D2 # d2 += r1'*r1 - vpaddq $M3,$D3,$D3 # d3 += r1'*r2 - vpaddq $M4,$D4,$D4 # d4 += r1'*r3 - - vpmuludq $T2,$S3,$M0 - vpmuludq $T2,$S4,$M1 - vpmuludq $T2,$R1,$M3 - vpmuludq $T2,$R2,$M4 - vpmuludq $T2,$R0,$M2 - vpsrlq \$32,$R4,$T4 - vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 - vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 - vpaddq $M3,$D3,$D3 # d3 += r2'*r1 - vpaddq $M4,$D4,$D4 # d4 += r2'*r2 - vpaddq $M2,$D2,$D2 # d2 += r2'*r0 - - vpmuludq $T3,$S2,$M0 - vpmuludq $T3,$R0,$M3 - vpmuludq $T3,$R1,$M4 - vpmuludq $T3,$S3,$M1 - vpmuludq $T3,$S4,$M2 - vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 - vpaddq $M3,$D3,$D3 # d3 += r3'*r0 - vpaddq $M4,$D4,$D4 # d4 += r3'*r1 - vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 - vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 - - vpmuludq $T4,$S4,$M3 - vpmuludq $T4,$R0,$M4 - vpmuludq $T4,$S1,$M0 - vpmuludq $T4,$S2,$M1 - vpmuludq $T4,$S3,$M2 - vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 - vpaddq $M4,$D4,$D4 # d4 += r2'*r0 - vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 - vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 - vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 - - ################################################################ - # load input - vmovdqu64 16*0($inp),%z#$T3 - vmovdqu64 16*4($inp),%z#$T4 - lea 16*8($inp),$inp - - ################################################################ - # lazy reduction - - vpsrlq \$26,$D3,$M3 - vpandq $MASK,$D3,$D3 - vpaddq $M3,$D4,$D4 # d3 -> d4 - - vpsrlq \$26,$D0,$M0 - vpandq $MASK,$D0,$D0 - vpaddq $M0,$D1,$D1 # d0 -> d1 - - vpsrlq \$26,$D4,$M4 - vpandq $MASK,$D4,$D4 - - vpsrlq \$26,$D1,$M1 - vpandq $MASK,$D1,$D1 - vpaddq $M1,$D2,$D2 # d1 -> d2 - - vpaddq $M4,$D0,$D0 - vpsllq \$2,$M4,$M4 - vpaddq $M4,$D0,$D0 # d4 -> d0 - - vpsrlq \$26,$D2,$M2 - vpandq $MASK,$D2,$D2 - vpaddq $M2,$D3,$D3 # d2 -> d3 - - vpsrlq \$26,$D0,$M0 - vpandq $MASK,$D0,$D0 - vpaddq $M0,$D1,$D1 # d0 -> d1 - - vpsrlq \$26,$D3,$M3 - vpandq $MASK,$D3,$D3 - vpaddq $M3,$D4,$D4 # d3 -> d4 - - ################################################################ - # at this point we have 14243444 in $R0-$S4 and 05060708 in - # $D0-$D4, ... 
- - vpunpcklqdq $T4,$T3,$T0 # transpose input - vpunpckhqdq $T4,$T3,$T4 - - # ... since input 64-bit lanes are ordered as 73625140, we could - # "vperm" it to 76543210 (here and in each loop iteration), *or* - # we could just flow along, hence the goal for $R0-$S4 is - # 1858286838784888 ... - - vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: - mov \$0x7777,%eax - kmovw %eax,%k1 - - vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- - vpermd $R1,$M0,$R1 - vpermd $R2,$M0,$R2 - vpermd $R3,$M0,$R3 - vpermd $R4,$M0,$R4 - - vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 - vpermd $D1,$M0,${R1}{%k1} - vpermd $D2,$M0,${R2}{%k1} - vpermd $D3,$M0,${R3}{%k1} - vpermd $D4,$M0,${R4}{%k1} - - vpslld \$2,$R1,$S1 # *5 - vpslld \$2,$R2,$S2 - vpslld \$2,$R3,$S3 - vpslld \$2,$R4,$S4 - vpaddd $R1,$S1,$S1 - vpaddd $R2,$S2,$S2 - vpaddd $R3,$S3,$S3 - vpaddd $R4,$S4,$S4 - - vpbroadcastq 32(%rcx),$PADBIT # .L129 - - vpsrlq \$52,$T0,$T2 # splat input - vpsllq \$12,$T4,$T3 - vporq $T3,$T2,$T2 - vpsrlq \$26,$T0,$T1 - vpsrlq \$14,$T4,$T3 - vpsrlq \$40,$T4,$T4 # 4 - vpandq $MASK,$T2,$T2 # 2 - vpandq $MASK,$T0,$T0 # 0 - #vpandq $MASK,$T1,$T1 # 1 - #vpandq $MASK,$T3,$T3 # 3 - #vporq $PADBIT,$T4,$T4 # padbit, yes, always - - vpaddq $H2,$T2,$H2 # accumulate input - sub \$192,$len - jbe .Ltail_avx512 - jmp .Loop_avx512 - -.align 32 -.Loop_avx512: - ################################################################ - # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 - # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 - # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 - # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 - # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 - # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 - # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 - # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 - # \________/\___________/ - ################################################################ - #vpaddq $H2,$T2,$H2 # accumulate input - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # however, as h2 is "chronologically" first one available pull - # corresponding operations up, so it's - # - # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 - # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 - # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 - # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 - # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 - - vpmuludq $H2,$R1,$D3 # d3 = h2*r1 - vpaddq $H0,$T0,$H0 - vpmuludq $H2,$R2,$D4 # d4 = h2*r2 - vpandq $MASK,$T1,$T1 # 1 - vpmuludq $H2,$S3,$D0 # d0 = h2*s3 - vpandq $MASK,$T3,$T3 # 3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - vporq $PADBIT,$T4,$T4 # padbit, yes, always - vpmuludq $H2,$R0,$D2 # d2 = h2*r0 - vpaddq $H1,$T1,$H1 # accumulate input - vpaddq $H3,$T3,$H3 - vpaddq $H4,$T4,$H4 - - vmovdqu64 16*0($inp),$T3 # load input - vmovdqu64 16*4($inp),$T4 - lea 16*8($inp),$inp - vpmuludq $H0,$R3,$M3 - vpmuludq $H0,$R4,$M4 - vpmuludq $H0,$R0,$M0 - vpmuludq $H0,$R1,$M1 - vpaddq $M3,$D3,$D3 # d3 += h0*r3 - vpaddq $M4,$D4,$D4 # d4 += h0*r4 - vpaddq $M0,$D0,$D0 # d0 += h0*r0 - vpaddq $M1,$D1,$D1 # d1 += h0*r1 - - vpmuludq $H1,$R2,$M3 - vpmuludq $H1,$R3,$M4 - vpmuludq $H1,$S4,$M0 - vpmuludq $H0,$R2,$M2 - vpaddq $M3,$D3,$D3 # d3 += h1*r2 - vpaddq $M4,$D4,$D4 # d4 += h1*r3 - vpaddq $M0,$D0,$D0 # d0 += h1*s4 - vpaddq $M2,$D2,$D2 # d2 += h0*r2 - - vpunpcklqdq $T4,$T3,$T0 # transpose input - vpunpckhqdq $T4,$T3,$T4 - - vpmuludq $H3,$R0,$M3 - 
vpmuludq $H3,$R1,$M4 - vpmuludq $H1,$R0,$M1 - vpmuludq $H1,$R1,$M2 - vpaddq $M3,$D3,$D3 # d3 += h3*r0 - vpaddq $M4,$D4,$D4 # d4 += h3*r1 - vpaddq $M1,$D1,$D1 # d1 += h1*r0 - vpaddq $M2,$D2,$D2 # d2 += h1*r1 - - vpmuludq $H4,$S4,$M3 - vpmuludq $H4,$R0,$M4 - vpmuludq $H3,$S2,$M0 - vpmuludq $H3,$S3,$M1 - vpaddq $M3,$D3,$D3 # d3 += h4*s4 - vpmuludq $H3,$S4,$M2 - vpaddq $M4,$D4,$D4 # d4 += h4*r0 - vpaddq $M0,$D0,$D0 # d0 += h3*s2 - vpaddq $M1,$D1,$D1 # d1 += h3*s3 - vpaddq $M2,$D2,$D2 # d2 += h3*s4 - - vpmuludq $H4,$S1,$M0 - vpmuludq $H4,$S2,$M1 - vpmuludq $H4,$S3,$M2 - vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 - vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 - vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 - - ################################################################ - # lazy reduction (interleaved with input splat) - - vpsrlq \$52,$T0,$T2 # splat input - vpsllq \$12,$T4,$T3 - - vpsrlq \$26,$D3,$H3 - vpandq $MASK,$D3,$D3 - vpaddq $H3,$D4,$H4 # h3 -> h4 - - vporq $T3,$T2,$T2 - - vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpandq $MASK,$T2,$T2 # 2 - - vpsrlq \$26,$H4,$D4 - vpandq $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpandq $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpaddq $T2,$H2,$H2 # modulo-scheduled - vpsrlq \$26,$T0,$T1 - - vpsrlq \$26,$H2,$D2 - vpandq $MASK,$H2,$H2 - vpaddq $D2,$D3,$H3 # h2 -> h3 - - vpsrlq \$14,$T4,$T3 - - vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$40,$T4,$T4 # 4 - - vpsrlq \$26,$H3,$D3 - vpandq $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpandq $MASK,$T0,$T0 # 0 - #vpandq $MASK,$T1,$T1 # 1 - #vpandq $MASK,$T3,$T3 # 3 - #vporq $PADBIT,$T4,$T4 # padbit, yes, always - - sub \$128,$len - ja .Loop_avx512 - -.Ltail_avx512: - ################################################################ - # while above multiplications were by r^8 in all lanes, in last - # iteration we multiply least significant lane by r^8 and most - # significant one by r, that's why table gets shifted... 
- - vpsrlq \$32,$R0,$R0 # 0105020603070408 - vpsrlq \$32,$R1,$R1 - vpsrlq \$32,$R2,$R2 - vpsrlq \$32,$S3,$S3 - vpsrlq \$32,$S4,$S4 - vpsrlq \$32,$R3,$R3 - vpsrlq \$32,$R4,$R4 - vpsrlq \$32,$S1,$S1 - vpsrlq \$32,$S2,$S2 - - ################################################################ - # load either next or last 64 byte of input - lea ($inp,$len),$inp - - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - - vpmuludq $H2,$R1,$D3 # d3 = h2*r1 - vpmuludq $H2,$R2,$D4 # d4 = h2*r2 - vpmuludq $H2,$S3,$D0 # d0 = h2*s3 - vpandq $MASK,$T1,$T1 # 1 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - vpandq $MASK,$T3,$T3 # 3 - vpmuludq $H2,$R0,$D2 # d2 = h2*r0 - vporq $PADBIT,$T4,$T4 # padbit, yes, always - vpaddq $H1,$T1,$H1 # accumulate input - vpaddq $H3,$T3,$H3 - vpaddq $H4,$T4,$H4 - - vmovdqu 16*0($inp),%x#$T0 - vpmuludq $H0,$R3,$M3 - vpmuludq $H0,$R4,$M4 - vpmuludq $H0,$R0,$M0 - vpmuludq $H0,$R1,$M1 - vpaddq $M3,$D3,$D3 # d3 += h0*r3 - vpaddq $M4,$D4,$D4 # d4 += h0*r4 - vpaddq $M0,$D0,$D0 # d0 += h0*r0 - vpaddq $M1,$D1,$D1 # d1 += h0*r1 - - vmovdqu 16*1($inp),%x#$T1 - vpmuludq $H1,$R2,$M3 - vpmuludq $H1,$R3,$M4 - vpmuludq $H1,$S4,$M0 - vpmuludq $H0,$R2,$M2 - vpaddq $M3,$D3,$D3 # d3 += h1*r2 - vpaddq $M4,$D4,$D4 # d4 += h1*r3 - vpaddq $M0,$D0,$D0 # d0 += h1*s4 - vpaddq $M2,$D2,$D2 # d2 += h0*r2 - - vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 - vpmuludq $H3,$R0,$M3 - vpmuludq $H3,$R1,$M4 - vpmuludq $H1,$R0,$M1 - vpmuludq $H1,$R1,$M2 - vpaddq $M3,$D3,$D3 # d3 += h3*r0 - vpaddq $M4,$D4,$D4 # d4 += h3*r1 - vpaddq $M1,$D1,$D1 # d1 += h1*r0 - vpaddq $M2,$D2,$D2 # d2 += h1*r1 - - vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 - vpmuludq $H4,$S4,$M3 - vpmuludq $H4,$R0,$M4 - vpmuludq $H3,$S2,$M0 - vpmuludq $H3,$S3,$M1 - vpmuludq $H3,$S4,$M2 - vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 - vpaddq $M4,$D4,$D4 # d4 += h4*r0 - vpaddq $M0,$D0,$D0 # d0 += h3*s2 - vpaddq $M1,$D1,$D1 # d1 += h3*s3 - vpaddq $M2,$D2,$D2 # d2 += h3*s4 - - vpmuludq $H4,$S1,$M0 - vpmuludq $H4,$S2,$M1 - vpmuludq $H4,$S3,$M2 - vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 - vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 - vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 - - ################################################################ - # horizontal addition - - mov \$1,%eax - vpermq \$0xb1,$H3,$D3 - vpermq \$0xb1,$D4,$H4 - vpermq \$0xb1,$H0,$D0 - vpermq \$0xb1,$H1,$D1 - vpermq \$0xb1,$H2,$D2 - vpaddq $D3,$H3,$H3 - vpaddq $D4,$H4,$H4 - vpaddq $D0,$H0,$H0 - vpaddq $D1,$H1,$H1 - vpaddq $D2,$H2,$H2 - - kmovw %eax,%k3 - vpermq \$0x2,$H3,$D3 - vpermq \$0x2,$H4,$D4 - vpermq \$0x2,$H0,$D0 - vpermq \$0x2,$H1,$D1 - vpermq \$0x2,$H2,$D2 - vpaddq $D3,$H3,$H3 - vpaddq $D4,$H4,$H4 - vpaddq $D0,$H0,$H0 - vpaddq $D1,$H1,$H1 - vpaddq $D2,$H2,$H2 - - vextracti64x4 \$0x1,$H3,%y#$D3 - vextracti64x4 \$0x1,$H4,%y#$D4 - vextracti64x4 \$0x1,$H0,%y#$D0 - vextracti64x4 \$0x1,$H1,%y#$D1 - vextracti64x4 \$0x1,$H2,%y#$D2 - vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case - vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 - vpaddq $D0,$H0,${H0}{%k3}{z} - vpaddq $D1,$H1,${H1}{%k3}{z} - vpaddq $D2,$H2,${H2}{%k3}{z} -___ -map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); -map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); -$code.=<<___; - ################################################################ - # lazy reduction (interleaved with input splat) - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpunpcklqdq $T3,$T2,$T2 
# 2:3 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpsrlq \$30,$T2,$T3 - vpsrlq \$4,$T2,$T2 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpsrlq \$26,$T0,$T1 - vpsrlq \$40,$T4,$T4 # 4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T0,$T0 # 0 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 - vpand $MASK,$T1,$T1 # 1 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - vpaddq $D3,$H4,$H4 # h3 -> h4 - - lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 - add \$64,$len - jnz .Ltail_avx2$suffix - - vpsubq $T2,$H2,$H2 # undo input accumulation - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced - vmovd %x#$H1,`4*1-48-64`($ctx) - vmovd %x#$H2,`4*2-48-64`($ctx) - vmovd %x#$H3,`4*3-48-64`($ctx) - vmovd %x#$H4,`4*4-48-64`($ctx) - vzeroall -___ -$code.=<<___ if ($win64); - movdqa -0xb0(%r10),%xmm6 - movdqa -0xa0(%r10),%xmm7 - movdqa -0x90(%r10),%xmm8 - movdqa -0x80(%r10),%xmm9 - movdqa -0x70(%r10),%xmm10 - movdqa -0x60(%r10),%xmm11 - movdqa -0x50(%r10),%xmm12 - movdqa -0x40(%r10),%xmm13 - movdqa -0x30(%r10),%xmm14 - movdqa -0x20(%r10),%xmm15 - lea -8(%r10),%rsp -.Ldo_avx512_epilogue: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - RET -.cfi_endproc -___ - -} - -} - -&declare_function("poly1305_blocks_avx2", 32, 4); -poly1305_blocks_avxN(0); -&end_function("poly1305_blocks_avx2"); - -####################################################################### -if ($avx>2) { -# On entry we have input length divisible by 64. But since inner loop -# processes 128 bytes per iteration, cases when length is not divisible -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this -# reason stack layout is kept identical to poly1305_blocks_avx2. If not -# for this tail, we wouldn't have to even allocate stack frame... - -&declare_function("poly1305_blocks_avx512", 32, 4); -poly1305_blocks_avxN(1); -&end_function("poly1305_blocks_avx512"); - -if (!$kernel && $avx>3) { -######################################################################## -# VPMADD52 version using 2^44 radix. -# -# One can argue that base 2^52 would be more natural. Well, even though -# some operations would be more natural, one has to recognize couple of -# things. Base 2^52 doesn't provide advantage over base 2^44 if you look -# at amount of multiply-n-accumulate operations. Secondly, it makes it -# impossible to pre-compute multiples of 5 [referred to as s[]/sN in -# reference implementations], which means that more such operations -# would have to be performed in inner loop, which in turn makes critical -# path longer. In other words, even though base 2^44 reduction might -# look less elegant, overall critical path is actually shorter... - -######################################################################## -# Layout of opaque area is following. 
-# -# unsigned __int64 h[3]; # current hash value base 2^44 -# unsigned __int64 s[2]; # key value*20 base 2^44 -# unsigned __int64 r[3]; # key value base 2^44 -# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; -# # r^n positions reflect -# # placement in register, not -# # memory, R[3] is R[1]*20 - -$code.=<<___; -.type poly1305_init_base2_44,\@function,3 -.align 32 -poly1305_init_base2_44: - xor %eax,%eax - mov %rax,0($ctx) # initialize hash value - mov %rax,8($ctx) - mov %rax,16($ctx) - -.Linit_base2_44: - lea poly1305_blocks_vpmadd52(%rip),%r10 - lea poly1305_emit_base2_44(%rip),%r11 - - mov \$0x0ffffffc0fffffff,%rax - mov \$0x0ffffffc0ffffffc,%rcx - and 0($inp),%rax - mov \$0x00000fffffffffff,%r8 - and 8($inp),%rcx - mov \$0x00000fffffffffff,%r9 - and %rax,%r8 - shrd \$44,%rcx,%rax - mov %r8,40($ctx) # r0 - and %r9,%rax - shr \$24,%rcx - mov %rax,48($ctx) # r1 - lea (%rax,%rax,4),%rax # *5 - mov %rcx,56($ctx) # r2 - shl \$2,%rax # magic <<2 - lea (%rcx,%rcx,4),%rcx # *5 - shl \$2,%rcx # magic <<2 - mov %rax,24($ctx) # s1 - mov %rcx,32($ctx) # s2 - movq \$-1,64($ctx) # write impossible value -___ -$code.=<<___ if ($flavour !~ /elf32/); - mov %r10,0(%rdx) - mov %r11,8(%rdx) -___ -$code.=<<___ if ($flavour =~ /elf32/); - mov %r10d,0(%rdx) - mov %r11d,4(%rdx) -___ -$code.=<<___; - mov \$1,%eax - RET -.size poly1305_init_base2_44,.-poly1305_init_base2_44 -___ -{ -my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); -my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); -my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52,\@function,4 -.align 32 -poly1305_blocks_vpmadd52: - shr \$4,$len - jz .Lno_data_vpmadd52 # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - - # if powers of the key are not calculated yet, process up to 3 - # blocks with this single-block subroutine, otherwise ensure that - # length is divisible by 2 blocks and pass the rest down to next - # subroutine... - - mov \$3,%rax - mov \$1,%r10 - cmp \$4,$len # is input long - cmovae %r10,%rax - test %r8,%r8 # is power value impossible? - cmovns %r10,%rax - - and $len,%rax # is input of favourable length? 
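The base 2^44 setup in poly1305_init_base2_44 above splits the clamped 128-bit r into limbs of 44, 44 and 40 bits and stores s = 20*r for the upper two limbs (the "magic <<2"): with limbs sitting at 2^0, 2^44 and 2^88, a product column that spills past the top limb lands at 2^132 = 4*2^130 == 4*5 = 20 (mod 2^130 - 5), so *20 multiples take the place of the *5 multiples used by the 26-bit code. A C sketch with illustrative names, taking only the r half of the key:

#include <stdint.h>

static void poly1305_key_base2_44(const uint8_t r_key[16], uint64_t r[3],
				  uint64_t s[2])
{
	uint64_t lo = 0, hi = 0;
	int i;

	for (i = 0; i < 8; i++) {	/* little-endian loads */
		lo |= (uint64_t)r_key[i] << (8 * i);
		hi |= (uint64_t)r_key[8 + i] << (8 * i);
	}
	lo &= 0x0ffffffc0fffffffULL;	/* standard Poly1305 clamping */
	hi &= 0x0ffffffc0ffffffcULL;

	r[0] = lo & ((1ULL << 44) - 1);				/* bits  0..43  */
	r[1] = ((lo >> 44) | (hi << 20)) & ((1ULL << 44) - 1);	/* bits 44..87  */
	r[2] = hi >> 24;					/* bits 88..127 */

	s[0] = r[1] * 20;	/* s1 = (r1*5) << 2 */
	s[1] = r[2] * 20;	/* s2 = (r2*5) << 2 */
}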
- jz .Lblocks_vpmadd52_4x - - sub %rax,$len - mov \$7,%r10d - mov \$1,%r11d - kmovw %r10d,%k7 - lea .L2_44_inp_permd(%rip),%r10 - kmovw %r11d,%k1 - - vmovq $padbit,%x#$PAD - vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd - vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift - vpermq \$0xcf,$PAD,$PAD - vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask - - vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value - vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys - vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} - vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} - - vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt - vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft - - jmp .Loop_vpmadd52 - -.align 32 -.Loop_vpmadd52: - vmovdqu32 0($inp),%x#$T0 # load input as ----3210 - lea 16($inp),$inp - - vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 - vpsrlvq $inp_shift,$T0,$T0 - vpandq $reduc_mask,$T0,$T0 - vporq $PAD,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo # accumulate input - - vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value - vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} - vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} - - vpxord $Dlo,$Dlo,$Dlo - vpxord $Dhi,$Dhi,$Dhi - - vpmadd52luq $r2r1r0,$H0,$Dlo - vpmadd52huq $r2r1r0,$H0,$Dhi - - vpmadd52luq $r1r0s2,$H1,$Dlo - vpmadd52huq $r1r0s2,$H1,$Dhi - - vpmadd52luq $r0s2s1,$H2,$Dlo - vpmadd52huq $r0s2s1,$H2,$Dhi - - vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword - vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword - vpandq $reduc_mask,$Dlo,$Dlo - - vpaddq $T0,$Dhi,$Dhi - - vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword - - vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) - - vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word - vpandq $reduc_mask,$Dlo,$Dlo - - vpermq \$0b10010011,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo - - vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} - - vpaddq $T0,$Dlo,$Dlo - vpsllq \$2,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo - - dec %rax # len-=16 - jnz .Loop_vpmadd52 - - vmovdqu64 $Dlo,0($ctx){%k7} # store hash value - - test $len,$len - jnz .Lblocks_vpmadd52_4x - -.Lno_data_vpmadd52: - RET -.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 -___ -} -{ -######################################################################## -# As implied by its name 4x subroutine processes 4 blocks in parallel -# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power -# and is handled in 256-bit %ymm registers. - -my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); -my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); -my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52_4x,\@function,4 -.align 32 -poly1305_blocks_vpmadd52_4x: - shr \$4,$len - jz .Lno_data_vpmadd52_4x # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - -.Lblocks_vpmadd52_4x: - vpbroadcastq $padbit,$PAD - - vmovdqa64 .Lx_mask44(%rip),$mask44 - mov \$5,%eax - vmovdqa64 .Lx_mask42(%rip),$mask42 - kmovw %eax,%k1 # used in 2x path - - test %r8,%r8 # is power value impossible? - js .Linit_vpmadd52 # if it is, then init R[4] - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - - test \$3,$len # is length 4*n+2? - jnz .Lblocks_vpmadd52_2x_do - -.Lblocks_vpmadd52_4x_do: - vpbroadcastq 64($ctx),$R0 # load 4th power of the key - vpbroadcastq 96($ctx),$R1 - vpbroadcastq 128($ctx),$R2 - vpbroadcastq 160($ctx),$S1 - -.Lblocks_vpmadd52_4x_key_loaded: - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - test \$7,$len # is len 8*n? 
- jz .Lblocks_vpmadd52_8x - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*2($inp),$T3 - lea 16*4($inp),$inp - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as 3-1-2-0 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - sub \$4,$len - jz .Ltail_vpmadd52_4x - jmp .Loop_vpmadd52_4x - ud2 - -.align 32 -.Linit_vpmadd52: - vmovq 24($ctx),%x#$S1 # load key - vmovq 56($ctx),%x#$H2 - vmovq 32($ctx),%x#$S2 - vmovq 40($ctx),%x#$R0 - vmovq 48($ctx),%x#$R1 - - vmovdqa $R0,$H0 - vmovdqa $R1,$H1 - vmovdqa $H2,$R2 - - mov \$2,%eax - -.Lmul_init_vpmadd52: - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - dec %eax - jz .Ldone_init_vpmadd52 - - vpunpcklqdq $R1,$H1,$R1 # 1,2 - vpbroadcastq %x#$H1,%x#$H1 # 2,2 - vpunpcklqdq $R2,$H2,$R2 - vpbroadcastq %x#$H2,%x#$H2 - vpunpcklqdq $R0,$H0,$R0 - vpbroadcastq %x#$H0,%x#$H0 - - vpsllq \$2,$R1,$S1 # S1 = R1*5*4 - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R1,$S1,$S1 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S1,$S1 - vpsllq \$2,$S2,$S2 - - jmp .Lmul_init_vpmadd52 - ud2 - -.align 32 -.Ldone_init_vpmadd52: - vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 - vinserti128 \$1,%x#$R2,$H2,$R2 - vinserti128 \$1,%x#$R0,$H0,$R0 - - vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 - vpermq \$0b11011000,$R2,$R2 - vpermq \$0b11011000,$R0,$R0 - - vpsllq \$2,$R1,$S1 # S1 = R1*5*4 - vpaddq $R1,$S1,$S1 - vpsllq \$2,$S1,$S1 - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - - test \$3,$len # is length 4*n+2? 
- jnz .Ldone_init_vpmadd52_2x - - vmovdqu64 $R0,64($ctx) # save key powers - vpbroadcastq %x#$R0,$R0 # broadcast 4th power - vmovdqu64 $R1,96($ctx) - vpbroadcastq %x#$R1,$R1 - vmovdqu64 $R2,128($ctx) - vpbroadcastq %x#$R2,$R2 - vmovdqu64 $S1,160($ctx) - vpbroadcastq %x#$S1,$S1 - - jmp .Lblocks_vpmadd52_4x_key_loaded - ud2 - -.align 32 -.Ldone_init_vpmadd52_2x: - vmovdqu64 $R0,64($ctx) # save key powers - vpsrldq \$8,$R0,$R0 # 0-1-0-2 - vmovdqu64 $R1,96($ctx) - vpsrldq \$8,$R1,$R1 - vmovdqu64 $R2,128($ctx) - vpsrldq \$8,$R2,$R2 - vmovdqu64 $S1,160($ctx) - vpsrldq \$8,$S1,$S1 - jmp .Lblocks_vpmadd52_2x_key_loaded - ud2 - -.align 32 -.Lblocks_vpmadd52_2x_do: - vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers - vmovdqu64 160+8($ctx),${S1}{%k1}{z} - vmovdqu64 64+8($ctx),${R0}{%k1}{z} - vmovdqu64 96+8($ctx),${R1}{%k1}{z} - -.Lblocks_vpmadd52_2x_key_loaded: - vmovdqu64 16*0($inp),$T2 # load data - vpxorq $T3,$T3,$T3 - lea 16*2($inp),$inp - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as x-1-x-0 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - jmp .Ltail_vpmadd52_2x - ud2 - -.align 32 -.Loop_vpmadd52_4x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*2($inp),$T3 - lea 16*4($inp),$inp - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction (interleaved with data splat) - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpsrlq \$24,$T3,$T2 - vporq $PAD,$T2,$T2 - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - sub \$4,$len # len-=64 - jnz .Loop_vpmadd52_4x - -.Ltail_vpmadd52_4x: - vmovdqu64 128($ctx),$R2 # load all key powers - vmovdqu64 160($ctx),$S1 - vmovdqu64 64($ctx),$R0 - vmovdqu64 96($ctx),$R1 - -.Ltail_vpmadd52_2x: - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - 
vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # horizontal addition - - mov \$1,%eax - kmovw %eax,%k1 - vpsrldq \$8,$D0lo,$T0 - vpsrldq \$8,$D0hi,$H0 - vpsrldq \$8,$D1lo,$T1 - vpsrldq \$8,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpsrldq \$8,$D2lo,$T2 - vpsrldq \$8,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vpermq \$0x2,$D0lo,$T0 - vpermq \$0x2,$D0hi,$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vpermq \$0x2,$D1lo,$T1 - vpermq \$0x2,$D1hi,$H1 - vpaddq $T0,$D0lo,${D0lo}{%k1}{z} - vpaddq $H0,$D0hi,${D0hi}{%k1}{z} - vpermq \$0x2,$D2lo,$T2 - vpermq \$0x2,$D2hi,$H2 - vpaddq $T1,$D1lo,${D1lo}{%k1}{z} - vpaddq $H1,$D1hi,${D1hi}{%k1}{z} - vpaddq $T2,$D2lo,${D2lo}{%k1}{z} - vpaddq $H2,$D2hi,${D2hi}{%k1}{z} - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - # at this point $len is - # either 4*n+2 or 0... - sub \$2,$len # len-=32 - ja .Lblocks_vpmadd52_4x_do - - vmovq %x#$H0,0($ctx) - vmovq %x#$H1,8($ctx) - vmovq %x#$H2,16($ctx) - vzeroall - -.Lno_data_vpmadd52_4x: - RET -.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x -___ -} -{ -######################################################################## -# As implied by its name 8x subroutine processes 8 blocks in parallel... -# This is intermediate version, as it's used only in cases when input -# length is either 8*n, 8*n+1 or 8*n+2... - -my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); -my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); -my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); -my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52_8x,\@function,4 -.align 32 -poly1305_blocks_vpmadd52_8x: - shr \$4,$len - jz .Lno_data_vpmadd52_8x # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - - vmovdqa64 .Lx_mask44(%rip),$mask44 - vmovdqa64 .Lx_mask42(%rip),$mask42 - - test %r8,%r8 # is power value impossible? 
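The reason the 4x main loop above multiplies every lane by the 4th key power while .Ltail_vpmadd52_4x reloads all four powers: lane j accumulates blocks j, j+4, j+8, ..., and a final per-lane multiplication by r^(4-j) followed by the horizontal addition reproduces the ordinary one-block-at-a-time Horner evaluation. Here is a toy demonstration of that identity, with a small prime standing in for 2^130 - 5; the prime, the function names and the test vector are illustrative only.

    #include <assert.h>
    #include <stdint.h>

    #define P 1000003ULL            /* toy prime standing in for 2^130 - 5 */

    static uint64_t mulmod(uint64_t a, uint64_t b)
    {
            return a * b % P;
    }

    /* plain Horner: what the one-block-at-a-time path computes */
    static uint64_t horner(const uint64_t *m, int n, uint64_t r)
    {
            uint64_t h = 0;

            for (int i = 0; i < n; i++)
                    h = mulmod(h + m[i], r);
            return h;
    }

    /* 4-lane version: bulk iterations multiply every lane by r^4, the
     * tail multiplies lane j by r^(4-j), then the lanes are summed
     * ("horizontal addition").  n must be a multiple of 4 here. */
    static uint64_t horner4(const uint64_t *m, int n, uint64_t r)
    {
            uint64_t rp[5] = { 1, r, 0, 0, 0 };
            uint64_t lane[4] = { 0, 0, 0, 0 };
            int i, j;

            for (i = 2; i <= 4; i++)
                    rp[i] = mulmod(rp[i - 1], r);

            for (i = 0; i + 4 < n; i += 4)
                    for (j = 0; j < 4; j++)
                            lane[j] = mulmod(lane[j] + m[i + j], rp[4]);

            for (j = 0; j < 4; j++)
                    lane[j] = mulmod(lane[j] + m[i + j], rp[4 - j]);

            return (lane[0] + lane[1] + lane[2] + lane[3]) % P;
    }

    int main(void)
    {
            uint64_t m[8] = { 11, 22, 33, 44, 55, 66, 77, 88 };

            assert(horner(m, 8, 12345) == horner4(m, 8, 12345));
            return 0;
    }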
- js .Linit_vpmadd52 # if it is, then init R[4] - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - -.Lblocks_vpmadd52_8x: - ################################################################ - # fist we calculate more key powers - - vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers - vmovdqu64 160($ctx),$S1 - vmovdqu64 64($ctx),$R0 - vmovdqu64 96($ctx),$R1 - - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - vpbroadcastq %x#$R2,$RR2 # broadcast 4th power - vpbroadcastq %x#$R0,$RR0 - vpbroadcastq %x#$R1,$RR1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $RR2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $RR2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $RR2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $RR2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $RR2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $RR2,$R0,$D2hi - - vpmadd52luq $RR0,$R0,$D0lo - vpmadd52huq $RR0,$R0,$D0hi - vpmadd52luq $RR0,$R1,$D1lo - vpmadd52huq $RR0,$R1,$D1hi - vpmadd52luq $RR0,$R2,$D2lo - vpmadd52huq $RR0,$R2,$D2hi - - vpmadd52luq $RR1,$S2,$D0lo - vpmadd52huq $RR1,$S2,$D0hi - vpmadd52luq $RR1,$R0,$D1lo - vpmadd52huq $RR1,$R0,$D1hi - vpmadd52luq $RR1,$R1,$D2lo - vpmadd52huq $RR1,$R1,$D2hi - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$RR0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$RR1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$RR2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$RR0,$RR0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$RR0,$RR0 - - vpsrlq \$44,$RR0,$tmp # additional step - vpandq $mask44,$RR0,$RR0 - - vpaddq $tmp,$RR1,$RR1 - - ################################################################ - # At this point Rx holds 1324 powers, RRx - 5768, and the goal - # is 15263748, which reflects how data is loaded... 
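The "15263748" goal above is about matching the key-power layout to the data-lane order produced by the qword unpacks just below: eight consecutive blocks come out with their halves interleaved as blocks 0,4,1,5,2,6,3,7 across lanes 0..7 (the "73625140" comment, read from the top lane down), so reading the power vector the same way must give r^1,r^5,r^2,r^6,r^3,r^7,r^4,r^8. The following index-only model of the unpack checks this ordering; it is my reading of the comments, expressed as an assert.

    #include <assert.h>

    /* vpunpcklqdq per 128-bit lane: out = { a.even, b.even } */
    static void unpacklo64(const int a[8], const int b[8], int out[8])
    {
            for (int k = 0; k < 4; k++) {
                    out[2 * k]     = a[2 * k];
                    out[2 * k + 1] = b[2 * k];
            }
    }

    int main(void)
    {
            /* which block each qword belongs to: block j is qwords 2j, 2j+1 */
            int t2[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };   /* first 64 bytes  */
            int t3[8] = { 4, 4, 5, 5, 6, 6, 7, 7 };   /* second 64 bytes */
            int lo[8], expect[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

            unpacklo64(t2, t3, lo);    /* low halves of the blocks */

            /* lanes 7..0 now hold blocks 7,3,6,2,5,1,4,0 ("73625140"),
             * so block j needs the power r^(8-j) in its lane, which read
             * the same way is "15263748". */
            for (int i = 0; i < 8; i++)
                    assert(lo[i] == expect[i]);
            return 0;
    }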
- - vpunpcklqdq $R2,$RR2,$T2 # 3748 - vpunpckhqdq $R2,$RR2,$R2 # 1526 - vpunpcklqdq $R0,$RR0,$T0 - vpunpckhqdq $R0,$RR0,$R0 - vpunpcklqdq $R1,$RR1,$T1 - vpunpckhqdq $R1,$RR1,$R1 -___ -######## switch to %zmm -map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); -map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); -map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); -map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); - -$code.=<<___; - vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 - vshufi64x2 \$0x44,$R0,$T0,$RR0 - vshufi64x2 \$0x44,$R1,$T1,$RR1 - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*4($inp),$T3 - lea 16*8($inp),$inp - - vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 - vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 - vpaddq $RR2,$SS2,$SS2 - vpaddq $RR1,$SS1,$SS1 - vpsllq \$2,$SS2,$SS2 - vpsllq \$2,$SS1,$SS1 - - vpbroadcastq $padbit,$PAD - vpbroadcastq %x#$mask44,$mask44 - vpbroadcastq %x#$mask42,$mask42 - - vpbroadcastq %x#$SS1,$S1 # broadcast 8th power - vpbroadcastq %x#$SS2,$S2 - vpbroadcastq %x#$RR0,$R0 - vpbroadcastq %x#$RR1,$R1 - vpbroadcastq %x#$RR2,$R2 - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as 73625140 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - sub \$8,$len - jz .Ltail_vpmadd52_8x - jmp .Loop_vpmadd52_8x - -.align 32 -.Loop_vpmadd52_8x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*4($inp),$T3 - lea 16*8($inp),$inp - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction (interleaved with data splat) - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpsrlq \$24,$T3,$T2 - vporq $PAD,$T2,$T2 - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - sub \$8,$len # len-=128 - jnz .Loop_vpmadd52_8x - -.Ltail_vpmadd52_8x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$SS1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$SS1,$D0hi - vpxorq 
$D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$SS2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$SS2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$RR0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$RR0,$D2hi - - vpmadd52luq $H0,$RR0,$D0lo - vpmadd52huq $H0,$RR0,$D0hi - vpmadd52luq $H0,$RR1,$D1lo - vpmadd52huq $H0,$RR1,$D1hi - vpmadd52luq $H0,$RR2,$D2lo - vpmadd52huq $H0,$RR2,$D2hi - - vpmadd52luq $H1,$SS2,$D0lo - vpmadd52huq $H1,$SS2,$D0hi - vpmadd52luq $H1,$RR0,$D1lo - vpmadd52huq $H1,$RR0,$D1hi - vpmadd52luq $H1,$RR1,$D2lo - vpmadd52huq $H1,$RR1,$D2hi - - ################################################################ - # horizontal addition - - mov \$1,%eax - kmovw %eax,%k1 - vpsrldq \$8,$D0lo,$T0 - vpsrldq \$8,$D0hi,$H0 - vpsrldq \$8,$D1lo,$T1 - vpsrldq \$8,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpsrldq \$8,$D2lo,$T2 - vpsrldq \$8,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vpermq \$0x2,$D0lo,$T0 - vpermq \$0x2,$D0hi,$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vpermq \$0x2,$D1lo,$T1 - vpermq \$0x2,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpermq \$0x2,$D2lo,$T2 - vpermq \$0x2,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vextracti64x4 \$1,$D0lo,%y#$T0 - vextracti64x4 \$1,$D0hi,%y#$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vextracti64x4 \$1,$D1lo,%y#$T1 - vextracti64x4 \$1,$D1hi,%y#$H1 - vextracti64x4 \$1,$D2lo,%y#$T2 - vextracti64x4 \$1,$D2hi,%y#$H2 -___ -######## switch back to %ymm -map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); -map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); -map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); - -$code.=<<___; - vpaddq $T0,$D0lo,${D0lo}{%k1}{z} - vpaddq $H0,$D0hi,${D0hi}{%k1}{z} - vpaddq $T1,$D1lo,${D1lo}{%k1}{z} - vpaddq $H1,$D1hi,${D1hi}{%k1}{z} - vpaddq $T2,$D2lo,${D2lo}{%k1}{z} - vpaddq $H2,$D2hi,${D2hi}{%k1}{z} - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - ################################################################ - - vmovq %x#$H0,0($ctx) - vmovq %x#$H1,8($ctx) - vmovq %x#$H2,16($ctx) - vzeroall - -.Lno_data_vpmadd52_8x: - RET -.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x -___ -} -$code.=<<___; -.type poly1305_emit_base2_44,\@function,3 -.align 32 -poly1305_emit_base2_44: - mov 0($ctx),%r8 # load hash value - mov 8($ctx),%r9 - mov 16($ctx),%r10 - - mov %r9,%rax - shr \$20,%r9 - shl \$44,%rax - mov %r10,%rcx - shr \$40,%r10 - shl \$24,%rcx - - add %rax,%r8 - adc %rcx,%r9 - adc \$0,%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? 
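poly1305_emit_base2_44 above repacks the 44/44/42-bit limbs into two 64-bit words plus two top bits, adds 5 so that the shr \$2 (and the cmovnz that follows) can detect h >= 2^130 - 5 and keep h + 5, i.e. h - p, in that case, then adds the 128-bit nonce. Below is a scalar sketch of the same steps, assuming fully reduced limbs and a little-endian host (the assembly adds the repacked words with carry, so it tolerates slightly oversized limbs); note the real code uses a branch-free conditional move where this sketch uses an if.

    #include <stdint.h>
    #include <string.h>

    static void poly1305_emit_44(const uint64_t h[3], const uint64_t nonce[2],
                                 uint8_t mac[16])
    {
            unsigned __int128 t;
            uint64_t h0, h1, h2, g0, g1;

            /* repack 44+44+42-bit limbs into 128 low bits + 2 top bits */
            h0 = h[0] | (h[1] << 44);
            h1 = (h[1] >> 20) | (h[2] << 24);
            h2 = h[2] >> 40;

            /* h >= 2^130 - 5 iff h + 5 carries into bit 130 */
            t  = (unsigned __int128)h0 + 5;
            g0 = (uint64_t)t;
            t  = (unsigned __int128)h1 + (uint64_t)(t >> 64);
            g1 = (uint64_t)t;
            h2 += (uint64_t)(t >> 64);

            if (h2 >> 2) {          /* the asm does this with cmovnz */
                    h0 = g0;
                    h1 = g1;
            }

            /* add the nonce mod 2^128 and write the little-endian tag */
            t  = (unsigned __int128)h0 + nonce[0];
            h0 = (uint64_t)t;
            h1 += nonce[1] + (uint64_t)(t >> 64);

            memcpy(mac, &h0, 8);    /* assumes a little-endian host */
            memcpy(mac + 8, &h1, 8);
    }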
- cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 -___ -} } } -} - -if (!$kernel) -{ # chacha20-poly1305 helpers -my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order - ("%rdi","%rsi","%rdx","%rcx"); # Unix order -$code.=<<___; -.globl xor128_encrypt_n_pad -.type xor128_encrypt_n_pad,\@abi-omnipotent -.align 16 -xor128_encrypt_n_pad: - sub $otp,$inp - sub $otp,$out - mov $len,%r10 # put len aside - shr \$4,$len # len / 16 - jz .Ltail_enc - nop -.Loop_enc_xmm: - movdqu ($inp,$otp),%xmm0 - pxor ($otp),%xmm0 - movdqu %xmm0,($out,$otp) - movdqa %xmm0,($otp) - lea 16($otp),$otp - dec $len - jnz .Loop_enc_xmm - - and \$15,%r10 # len % 16 - jz .Ldone_enc - -.Ltail_enc: - mov \$16,$len - sub %r10,$len - xor %eax,%eax -.Loop_enc_byte: - mov ($inp,$otp),%al - xor ($otp),%al - mov %al,($out,$otp) - mov %al,($otp) - lea 1($otp),$otp - dec %r10 - jnz .Loop_enc_byte - - xor %eax,%eax -.Loop_enc_pad: - mov %al,($otp) - lea 1($otp),$otp - dec $len - jnz .Loop_enc_pad - -.Ldone_enc: - mov $otp,%rax - RET -.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad - -.globl xor128_decrypt_n_pad -.type xor128_decrypt_n_pad,\@abi-omnipotent -.align 16 -xor128_decrypt_n_pad: - sub $otp,$inp - sub $otp,$out - mov $len,%r10 # put len aside - shr \$4,$len # len / 16 - jz .Ltail_dec - nop -.Loop_dec_xmm: - movdqu ($inp,$otp),%xmm0 - movdqa ($otp),%xmm1 - pxor %xmm0,%xmm1 - movdqu %xmm1,($out,$otp) - movdqa %xmm0,($otp) - lea 16($otp),$otp - dec $len - jnz .Loop_dec_xmm - - pxor %xmm1,%xmm1 - and \$15,%r10 # len % 16 - jz .Ldone_dec - -.Ltail_dec: - mov \$16,$len - sub %r10,$len - xor %eax,%eax - xor %r11d,%r11d -.Loop_dec_byte: - mov ($inp,$otp),%r11b - mov ($otp),%al - xor %r11b,%al - mov %al,($out,$otp) - mov %r11b,($otp) - lea 1($otp),$otp - dec %r10 - jnz .Loop_dec_byte - - xor %eax,%eax -.Loop_dec_pad: - mov %al,($otp) - lea 1($otp),$otp - dec $len - jnz .Loop_dec_pad - -.Ldone_dec: - mov $otp,%rax - RET -.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad -___ -} - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent -.align 16 -se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lcommon_seh_tail - - lea 48(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 - mov -48(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R14 - - jmp 
.Lcommon_seh_tail -.size se_handler,.-se_handler - -.type avx_handler,\@abi-omnipotent -.align 16 -avx_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->RipRsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lcommon_seh_tail - - mov 208($context),%rax # pull context->R11 - - lea 0x50(%rax),%rsi - lea 0xf8(%rax),%rax - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx - .long 0xa548f3fc # cld; rep movsq - -.Lcommon_seh_tail: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$154,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - RET -.size avx_handler,.-avx_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_poly1305_block_init_arch - .rva .LSEH_end_poly1305_block_init_arch - .rva .LSEH_info_poly1305_block_init_arch - - .rva .LSEH_begin_poly1305_blocks_x86_64 - .rva .LSEH_end_poly1305_blocks_x86_64 - .rva .LSEH_info_poly1305_blocks_x86_64 - - .rva .LSEH_begin_poly1305_emit_x86_64 - .rva .LSEH_end_poly1305_emit_x86_64 - .rva .LSEH_info_poly1305_emit_x86_64 -___ -$code.=<<___ if ($avx); - .rva .LSEH_begin_poly1305_blocks_avx - .rva .Lbase2_64_avx - .rva .LSEH_info_poly1305_blocks_avx_1 - - .rva .Lbase2_64_avx - .rva .Leven_avx - .rva .LSEH_info_poly1305_blocks_avx_2 - - .rva .Leven_avx - .rva .LSEH_end_poly1305_blocks_avx - .rva .LSEH_info_poly1305_blocks_avx_3 - - .rva .LSEH_begin_poly1305_emit_avx - .rva .LSEH_end_poly1305_emit_avx - .rva .LSEH_info_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); - .rva .LSEH_begin_poly1305_blocks_avx2 - .rva .Lbase2_64_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_1 - - .rva .Lbase2_64_avx2 - .rva .Leven_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_2 - - .rva .Leven_avx2 - .rva .LSEH_end_poly1305_blocks_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_3 -___ -$code.=<<___ if ($avx>2); - .rva .LSEH_begin_poly1305_blocks_avx512 - .rva .LSEH_end_poly1305_blocks_avx512 - .rva .LSEH_info_poly1305_blocks_avx512 -___ -$code.=<<___; -.section .xdata -.align 8 -.LSEH_info_poly1305_block_init_arch: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch - -.LSEH_info_poly1305_blocks_x86_64: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_body,.Lblocks_epilogue - -.LSEH_info_poly1305_emit_x86_64: - .byte 9,0,0,0 - .rva se_handler 
- .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 -___ -$code.=<<___ if ($avx); -.LSEH_info_poly1305_blocks_avx_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_emit_avx: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); -.LSEH_info_poly1305_blocks_avx2_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] -___ -$code.=<<___ if ($avx>2); -.LSEH_info_poly1305_blocks_avx512: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] -___ -} - -open SELF,$0; -while() { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - -foreach (split('\n',$code)) { - s/\`([^\`]*)\`/eval($1)/ge; - s/%r([a-z]+)#d/%e$1/g; - s/%r([0-9]+)#d/%r$1d/g; - s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; - - if ($kernel) { - s/(^\.type.*),[0-9]+$/\1/; - s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; - next if /^\.cfi.*/; - } - - print $_,"\n"; -} -close STDOUT; diff --git a/arch/x86/lib/crypto/poly1305_glue.c b/arch/x86/lib/crypto/poly1305_glue.c deleted file mode 100644 index b7e78a583e07..000000000000 --- a/arch/x86/lib/crypto/poly1305_glue.c +++ /dev/null @@ -1,129 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct poly1305_arch_internal { - union { - struct { - u32 h[5]; - u32 is_base2_26; - }; - u64 hs[3]; - }; - u64 r[2]; - u64 pad; - struct { u32 r2, r1, r4, r3; } rn[9]; -}; - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx, - const u8 *inp, - const size_t len, const u32 padbit); -asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx, - const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx, - const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx, - const u8 *inp, - const size_t len, const u32 padbit); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, - unsigned int len, u32 padbit) -{ - struct poly1305_arch_internal *ctx = - container_of(&state->h.h, struct poly1305_arch_internal, h); - - /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || - SZ_4K % POLY1305_BLOCK_SIZE); - - if (!static_branch_likely(&poly1305_use_avx)) { - poly1305_blocks_x86_64(ctx, inp, len, padbit); - return; - } - - do { - const unsigned int bytes = min(len, SZ_4K); - - kernel_fpu_begin(); - if (static_branch_likely(&poly1305_use_avx512)) - poly1305_blocks_avx512(ctx, inp, bytes, padbit); - else if (static_branch_likely(&poly1305_use_avx2)) - poly1305_blocks_avx2(ctx, inp, bytes, padbit); - else - poly1305_blocks_avx(ctx, inp, bytes, padbit); - kernel_fpu_end(); - - len -= bytes; - inp += bytes; - } while (len); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -void poly1305_emit_arch(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) -{ - if (!static_branch_likely(&poly1305_use_avx)) - poly1305_emit_x86_64(ctx, mac, nonce); - else - poly1305_emit_avx(ctx, mac, nonce); -} -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -bool poly1305_is_arch_optimized(void) -{ - return static_key_enabled(&poly1305_use_avx); -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -static int __init poly1305_simd_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_AVX) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx); - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx2); - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_AVX512F) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && - /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. 
*/ - boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) - static_branch_enable(&poly1305_use_avx512); - return 0; -} -subsys_initcall(poly1305_simd_mod_init); - -static void __exit poly1305_simd_mod_exit(void) -{ -} -module_exit(poly1305_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jason A. Donenfeld "); -MODULE_DESCRIPTION("Poly1305 authenticator"); diff --git a/arch/x86/lib/crypto/sha256-avx-asm.S b/arch/x86/lib/crypto/sha256-avx-asm.S deleted file mode 100644 index 0d7b2c3e45d9..000000000000 --- a/arch/x86/lib/crypto/sha256-avx-asm.S +++ /dev/null @@ -1,499 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with AVX1 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford -# Kirk Yap -# Tim Chen -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. 
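The SHA-256 assembly below is easier to follow next to the scalar operations its comments name: the per-round S0, S1, CH and MAJ terms (the code uses the equivalent forms CH = ((f^g)&e)^g and MAJ = ((a|c)&b)|(a&c)), and the s0/s1 message-schedule terms that FOUR_ROUNDS_AND_SCHED computes four at a time. Here is a plain C reference of one schedule word and one round; the function and variable names are mine, not taken from the file.

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, int r)
    {
            return (x >> r) | (x << (32 - r));
    }

    /* W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16], over a rolling
     * 16-word window (the X0..X3 registers below hold this window).
     * Call with i >= 16. */
    static uint32_t sha256_schedule(uint32_t w[16], int i)
    {
            uint32_t w2 = w[(i - 2) & 15], w15 = w[(i - 15) & 15];
            uint32_t w7 = w[(i - 7) & 15], w16 = w[(i - 16) & 15];
            uint32_t s0 = ror32(w15, 7) ^ ror32(w15, 18) ^ (w15 >> 3);
            uint32_t s1 = ror32(w2, 17) ^ ror32(w2, 19) ^ (w2 >> 10);

            w[i & 15] = s1 + w7 + s0 + w16;
            return w[i & 15];
    }

    /* One round, using the same CH/MAJ formulations as the comments;
     * k is the round constant from K256, wt the scheduled word. */
    static void sha256_round(uint32_t s[8], uint32_t k, uint32_t wt)
    {
            uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
            uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
            uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
            uint32_t CH  = ((f ^ g) & e) ^ g;           /* == (e&f)^(~e&g) */
            uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
            uint32_t MAJ = ((a | c) & b) | (a & c);     /* == majority(a,b,c) */
            uint32_t t1  = h + S1 + CH + k + wt;

            /* rotate the working variables, as ROTATE_ARGS does */
            s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
            s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + S0 + MAJ;
    }

The assembly keeps this 16-word window in vector registers and interleaves the schedule updates with the scalar round computation, which is what the heavily interleaved macros below are doing.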
-# -######################################################################## -# This code schedules 1 block at a time, with 4 lanes per block -######################################################################## - -#include -#include - -## assume buffers not aligned -#define VMOVDQ vmovdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - - -.macro MY_ROR p1 p2 - shld $(32-(\p1)), \p2, \p2 -.endm - -################################ - -# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -# Load xmm with mem and byte swap each dword -.macro COPY_XMM_AND_BSWAP p1 p2 p3 - VMOVDQ \p2, \p1 - vpshufb \p3, \p1, \p1 -.endm - -################################ - -X0 = %xmm4 -X1 = %xmm5 -X2 = %xmm6 -X3 = %xmm7 - -XTMP0 = %xmm0 -XTMP1 = %xmm1 -XTMP2 = %xmm2 -XTMP3 = %xmm3 -XTMP4 = %xmm8 -XFER = %xmm9 -XTMP5 = %xmm11 - -SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %xmm13 - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg - -SRND = %rsi # clobbers INP -c = %ecx -d = %r8d -e = %edx -TBL = %r12 -a = %eax -b = %ebx - -f = %r9d -g = %r10d -h = %r11d - -y0 = %r13d -y1 = %r14d -y2 = %r15d - - -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_XFER_SIZE = 16 -_XMM_SAVE_SIZE = 0 - -_INP_END = 0 -_INP = _INP_END + _INP_END_SIZE -_XFER = _INP + _INP_SIZE -_XMM_SAVE = _XFER + _XFER_SIZE -STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED - ## compute s0 four at a time and s1 two at a time - ## compute W[-16] + W[-7] 4 at a time - - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] - MY_ROR (22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - ## compute s0 - vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add _XFER(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpsrld $7, XTMP1, XTMP2 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpslld $(32-7), XTMP1, XTMP3 - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - mov a, y1 # y1 = a - MY_ROR (25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (22-13), y1 # y1 = a >> (22-13) - vpsrld $18, XTMP1, XTMP2 # - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - 
vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - vpslld $(32-18), XTMP1, XTMP1 - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - vpxor XTMP1, XTMP3, XTMP3 # - add y0, y2 # y2 = S1 + CH - add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - ## compute low s1 - vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - mov a, y1 # y1 = a - MY_ROR (25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - MY_ROR (22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - xor g, y2 # y2 = f^g - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA} - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - vpxor XTMP3, XTMP2, XTMP2 # - add y0, y2 # y2 = S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - ## compute high s1 - vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - MY_ROR (22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC} - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - vpxor XTMP3, XTMP2, XTMP2 - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - vpxor XTMP2, XTMP5, 
XTMP5 # XTMP5 = s1 {xDxC} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - rotate_Xs -.endm - -## input is [rsp + _XFER + %1 * 4] -.macro DO_ROUND round - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - xor e, y0 # y0 = e ^ (e >> (25-11)) - MY_ROR (22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - and e, y2 # y2 = (f^g)&e - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - add y0, y2 # y2 = S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - offset = \round * 4 + _XFER # - add offset(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -.endm - -######################################################################## -## void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], -## const u8 *data, size_t nblocks); -######################################################################## -.text -SYM_FUNC_START(sha256_transform_avx) - ANNOTATE_NOENDBR # since this is called only via static_call - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - movq %rsp, %rbp - - subq $STACK_SIZE, %rsp # allocate stack space - and $~15, %rsp # align stack pointer - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - add INP, NUM_BLKS # pointer to end of data - mov NUM_BLKS, _INP_END(%rsp) - - ## load initial digest - mov 4*0(CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 -.Lloop0: - lea K256(%rip), TBL - - ## byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK - - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 16 each - mov $3, SRND -.align 16 -.Lloop1: - vpaddd (TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 1*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 2*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 3*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - add $4*16, TBL - FOUR_ROUNDS_AND_SCHED - - sub $1, SRND - jne .Lloop1 - - mov $2, SRND -.Lloop2: - vpaddd (TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vpaddd 1*16(TBL), X1, XFER - 
vmovdqa XFER, _XFER(%rsp) - add $2*16, TBL - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vmovdqa X2, X0 - vmovdqa X3, X1 - - sub $1, SRND - jne .Lloop2 - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - mov _INP(%rsp), INP - add $64, INP - cmp _INP_END(%rsp), INP - jne .Lloop0 - -.Ldone_hash: - - mov %rbp, %rsp - popq %rbp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - RET -SYM_FUNC_END(sha256_transform_avx) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 -.align 16 -# shuffle xBxA -> 00BA -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 -.align 16 -# shuffle xDxC -> DC00 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/lib/crypto/sha256-avx2-asm.S b/arch/x86/lib/crypto/sha256-avx2-asm.S deleted file mode 100644 index 25d3380321ec..000000000000 --- a/arch/x86/lib/crypto/sha256-avx2-asm.S +++ /dev/null @@ -1,774 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with AVX2 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford -# Kirk Yap -# Tim Chen -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## -# This code schedules 2 blocks at a time, with 4 lanes per block -######################################################################## - -#include -#include - -## assume buffers not aligned -#define VMOVDQ vmovdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - -################################ - -X0 = %ymm4 -X1 = %ymm5 -X2 = %ymm6 -X3 = %ymm7 - -# XMM versions of above -XWORD0 = %xmm4 -XWORD1 = %xmm5 -XWORD2 = %xmm6 -XWORD3 = %xmm7 - -XTMP0 = %ymm0 -XTMP1 = %ymm1 -XTMP2 = %ymm2 -XTMP3 = %ymm3 -XTMP4 = %ymm8 -XFER = %ymm9 -XTMP5 = %ymm11 - -SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %ymm13 - -X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg -c = %ecx -d = %r8d -e = %edx # clobbers NUM_BLKS -y3 = %esi # clobbers INP - -SRND = CTX # SRND is same register as CTX - -a = %eax -b = %ebx -f = %r9d -g = %r10d -h = %r11d -old_h = %r11d - -T1 = %r12d -y0 = %r13d -y1 = %r14d -y2 = %r15d - - -_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round -_XMM_SAVE_SIZE = 0 -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_CTX_SIZE = 8 - -_XFER = 0 -_XMM_SAVE = _XFER + _XFER_SIZE -_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE -_INP = _INP_END + _INP_END_SIZE -_CTX = _INP + _INP_SIZE -STACK_SIZE = _CTX + _CTX_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs - X_ = X0 - X0 = X1 - X1 = X2 - X2 = X3 - X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS - old_h = h - TMP_ = h - h = g - g = f - f = e - e = d - d = c - c = b - b = a - a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED disp -################################### RND N + 0 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - - addl \disp(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - - and e, y2 # y2 = (f^g)&e # CH - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - add h, d # d = k + w + h + d # -- - - and b, y3 # y3 = (a|c)&b # MAJA - vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - vpsrld $7, XTMP1, XTMP2 - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - - add y0, y2 # y2 = S1 + CH # -- - vpslld $(32-7), 
XTMP1, XTMP3 - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 - - vpsrld $18, XTMP1, XTMP2 - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - - ROTATE_ARGS - -################################### RND N + 1 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - offset = \disp + 1*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - - - rorx $6, e, y1 # y1 = (e >> 6) # S1 - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - - vpslld $(32-18), XTMP1, XTMP1 - and b, y3 # y3 = (a|c)&b # MAJA - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - - vpxor XTMP1, XTMP3, XTMP3 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 - vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - - - ROTATE_ARGS - -################################### RND N + 2 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - offset = \disp + 2*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} - rorx $11, e, y1 # y1 = e >> 11 # S1B - or c, y3 # y3 = a|c # MAJA - mov f, y2 # y2 = f # CH - xor g, y2 # y2 = f^g # CH - - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} - and e, y2 # y2 = (f^g)&e # CH - - rorx $6, e, y1 # y1 = (e >> 6) # S1 - vpxor XTMP3, XTMP2, XTMP2 - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a ,T1 # T1 = (a >> 2) # S0 - vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} - - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1,h # h = k + w + h + S0 # -- - add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3,h # h = t1 + S0 + MAJ # -- - - - ROTATE_ARGS - -################################### RND N + 3 ############################ - 
- mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - offset = \disp + 3*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - - - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpxor XTMP3, XTMP2, XTMP2 - rorx $22, a, y1 # y1 = a >> 22 # S0A - add y0, y2 # y2 = S1 + CH # -- - - vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - rorx $2, a, T1 # T1 = (a >> 2) # S0 - vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} - - vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - - add y1, h # h = k + w + h + S0 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - ROTATE_ARGS - rotate_Xs -.endm - -.macro DO_4ROUNDS disp -################################### RND N + 0 ########################### - - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - addl \disp(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 1 ########################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*1 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = 
S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 2 ############################## - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*2 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 3 ########################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*3 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3, h # h = t1 + S0 + MAJ # -- - - ROTATE_ARGS - -.endm - -######################################################################## -## void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], -## const u8 *data, size_t nblocks); -######################################################################## -.text -SYM_FUNC_START(sha256_transform_rorx) - ANNOTATE_NOENDBR # since this is called only via static_call - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - push %rbp - mov %rsp, %rbp - - subq $STACK_SIZE, %rsp - and $-32, %rsp # align rsp to 32 byte boundary - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block - mov NUM_BLKS, _INP_END(%rsp) - - cmp NUM_BLKS, INP - je .Lonly_one_block - - ## load initial digest - mov (CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 
4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 - - mov CTX, _CTX(%rsp) - -.Lloop0: - ## Load first 16 dwords from two blocks - VMOVDQ 0*32(INP),XTMP0 - VMOVDQ 1*32(INP),XTMP1 - VMOVDQ 2*32(INP),XTMP2 - VMOVDQ 3*32(INP),XTMP3 - - ## byte swap data - vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0 - vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 - vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 - vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 - - ## transpose data into high/low halves - vperm2i128 $0x20, XTMP2, XTMP0, X0 - vperm2i128 $0x31, XTMP2, XTMP0, X1 - vperm2i128 $0x20, XTMP3, XTMP1, X2 - vperm2i128 $0x31, XTMP3, XTMP1, X3 - -.Llast_block_enter: - add $64, INP - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 12 each - xor SRND, SRND - -.align 16 -.Lloop1: - leaq K256+0*32(%rip), INP ## reuse INP as scratch reg - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 0*32) - - leaq K256+1*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 1*32) - - leaq K256+2*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 2*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 2*32) - - leaq K256+3*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 3*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 3*32) - - add $4*32, SRND - cmp $3*4*32, SRND - jb .Lloop1 - -.Lloop2: - ## Do last 16 rounds with no scheduling - leaq K256+0*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - DO_4ROUNDS (_XFER + 0*32) - - leaq K256+1*32(%rip), INP - vpaddd (INP, SRND), X1, XFER - vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - DO_4ROUNDS (_XFER + 1*32) - add $2*32, SRND - - vmovdqa X2, X0 - vmovdqa X3, X1 - - cmp $4*4*32, SRND - jb .Lloop2 - - mov _CTX(%rsp), CTX - mov _INP(%rsp), INP - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - cmp _INP_END(%rsp), INP - ja .Ldone_hash - - #### Do second block using previously scheduled results - xor SRND, SRND -.align 16 -.Lloop3: - DO_4ROUNDS (_XFER + 0*32 + 16) - DO_4ROUNDS (_XFER + 1*32 + 16) - add $2*32, SRND - cmp $4*4*32, SRND - jb .Lloop3 - - mov _CTX(%rsp), CTX - mov _INP(%rsp), INP - add $64, INP - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - cmp _INP_END(%rsp), INP - jb .Lloop0 - ja .Ldone_hash - -.Ldo_last_block: - VMOVDQ 0*16(INP),XWORD0 - VMOVDQ 1*16(INP),XWORD1 - VMOVDQ 2*16(INP),XWORD2 - VMOVDQ 3*16(INP),XWORD3 - - vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 - vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 - vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 - vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 - - jmp .Llast_block_enter - -.Lonly_one_block: - - ## load initial digest - mov (4*0)(CTX),a - mov (4*1)(CTX),b - mov (4*2)(CTX),c - mov (4*3)(CTX),d - mov (4*4)(CTX),e - mov (4*5)(CTX),f - mov (4*6)(CTX),g - mov (4*7)(CTX),h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 - - mov CTX, _CTX(%rsp) - jmp .Ldo_last_block - -.Ldone_hash: - - mov %rbp, %rsp - pop %rbp - - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - vzeroupper - RET -SYM_FUNC_END(sha256_transform_rorx) - -.section 
.rodata.cst512.K256, "aM", @progbits, 512 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 -.align 32 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 - -# shuffle xBxA -> 00BA -.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32 -.align 32 -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 - -# shuffle xDxC -> DC00 -.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32 -.align 32 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/lib/crypto/sha256-ni-asm.S b/arch/x86/lib/crypto/sha256-ni-asm.S deleted file mode 100644 index d3548206cf3d..000000000000 --- a/arch/x86/lib/crypto/sha256-ni-asm.S +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Intel SHA Extensions optimized implementation of a SHA-256 update function - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Sean Gulley - * Tim Chen - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include -#include - -#define STATE_PTR %rdi /* 1st arg */ -#define DATA_PTR %rsi /* 2nd arg */ -#define NUM_BLKS %rdx /* 3rd arg */ - -#define SHA256CONSTANTS %rax - -#define MSG %xmm0 /* sha256rnds2 implicit operand */ -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 -#define TMP %xmm7 - -#define SHUF_MASK %xmm8 - -#define ABEF_SAVE %xmm9 -#define CDGH_SAVE %xmm10 - -.macro do_4rounds i, m0, m1, m2, m3 -.if \i < 16 - movdqu \i*4(DATA_PTR), \m0 - pshufb SHUF_MASK, \m0 -.endif - movdqa (\i-32)*4(SHA256CONSTANTS), MSG - paddd \m0, MSG - sha256rnds2 STATE0, STATE1 -.if \i >= 12 && \i < 60 - movdqa \m0, TMP - palignr $4, \m3, TMP - paddd TMP, \m1 - sha256msg2 \m0, \m1 -.endif - punpckhqdq MSG, MSG - sha256rnds2 STATE1, STATE0 -.if \i >= 4 && \i < 52 - sha256msg1 \m0, \m3 -.endif -.endm - -/* - * Intel SHA Extensions optimized implementation of a SHA-256 block function - * - * This function takes a pointer to the current SHA-256 state, a pointer to the - * input data, and the number of 64-byte blocks to process. Once all blocks - * have been processed, the state is updated with the new state. This function - * only processes complete blocks. State initialization, buffering of partial - * blocks, and digest finalization is expected to be handled elsewhere. 
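For reference, the caller-side contract described in the comment above (the transform only ever sees whole 64-byte blocks; buffering of partial blocks and finalization are handled by the shared library code) looks roughly like the following standalone C sketch. The names sketch_sha256_ctx, sketch_blocks_fn and sketch_sha256_update are illustrative only, not the kernel's API.

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	#define SKETCH_SHA256_BLOCK_SIZE 64

	struct sketch_sha256_ctx {
		uint32_t state[8];			/* H0..H7 */
		uint8_t buf[SKETCH_SHA256_BLOCK_SIZE];	/* buffered partial block */
		size_t buflen;				/* bytes currently buffered */
	};

	/* Same shape as the assembly entry point: state, data, whole blocks. */
	typedef void (*sketch_blocks_fn)(uint32_t state[8], const uint8_t *data,
					 size_t nblocks);

	static void sketch_sha256_update(struct sketch_sha256_ctx *ctx,
					 const uint8_t *data, size_t len,
					 sketch_blocks_fn blocks)
	{
		/* Top up and flush a previously buffered partial block first. */
		if (ctx->buflen) {
			size_t fill = SKETCH_SHA256_BLOCK_SIZE - ctx->buflen;

			if (fill > len)
				fill = len;
			memcpy(&ctx->buf[ctx->buflen], data, fill);
			ctx->buflen += fill;
			data += fill;
			len -= fill;
			if (ctx->buflen == SKETCH_SHA256_BLOCK_SIZE) {
				blocks(ctx->state, ctx->buf, 1);
				ctx->buflen = 0;
			}
		}

		/* Hand every remaining whole block to the block function at once. */
		if (len >= SKETCH_SHA256_BLOCK_SIZE) {
			size_t nblocks = len / SKETCH_SHA256_BLOCK_SIZE;

			blocks(ctx->state, data, nblocks);
			data += nblocks * SKETCH_SHA256_BLOCK_SIZE;
			len -= nblocks * SKETCH_SHA256_BLOCK_SIZE;
		}

		/* Stash the tail for the next update or for finalization. */
		if (len) {
			memcpy(&ctx->buf[ctx->buflen], data, len);
			ctx->buflen += len;
		}
	}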
- * - * void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], - * const u8 *data, size_t nblocks); - */ -.text -SYM_FUNC_START(sha256_ni_transform) - ANNOTATE_NOENDBR # since this is called only via static_call - - shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash - add DATA_PTR, NUM_BLKS /* pointer to end of data */ - - /* - * load initial hash values - * Need to reorder these appropriately - * DCBA, HGFE -> ABEF, CDGH - */ - movdqu 0*16(STATE_PTR), STATE0 /* DCBA */ - movdqu 1*16(STATE_PTR), STATE1 /* HGFE */ - - movdqa STATE0, TMP - punpcklqdq STATE1, STATE0 /* FEBA */ - punpckhqdq TMP, STATE1 /* DCHG */ - pshufd $0x1B, STATE0, STATE0 /* ABEF */ - pshufd $0xB1, STATE1, STATE1 /* CDGH */ - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - lea K256+32*4(%rip), SHA256CONSTANTS - -.Lloop0: - /* Save hash values for addition after rounds */ - movdqa STATE0, ABEF_SAVE - movdqa STATE1, CDGH_SAVE - -.irp i, 0, 16, 32, 48 - do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3 - do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0 - do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1 - do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2 -.endr - - /* Add current hash values with previously saved */ - paddd ABEF_SAVE, STATE0 - paddd CDGH_SAVE, STATE1 - - /* Increment data pointer and loop if more to process */ - add $64, DATA_PTR - cmp NUM_BLKS, DATA_PTR - jne .Lloop0 - - /* Write hash values back in the correct order */ - movdqa STATE0, TMP - punpcklqdq STATE1, STATE0 /* GHEF */ - punpckhqdq TMP, STATE1 /* ABCD */ - pshufd $0xB1, STATE0, STATE0 /* HGFE */ - pshufd $0x1B, STATE1, STATE1 /* DCBA */ - - movdqu STATE1, 0*16(STATE_PTR) - movdqu STATE0, 1*16(STATE_PTR) - -.Ldone_hash: - - RET -SYM_FUNC_END(sha256_ni_transform) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/arch/x86/lib/crypto/sha256-ssse3-asm.S b/arch/x86/lib/crypto/sha256-ssse3-asm.S deleted file mode 100644 index 7f24a4cdcb25..000000000000 --- a/arch/x86/lib/crypto/sha256-ssse3-asm.S +++ /dev/null @@ -1,511 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with SSSE3 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford -# Kirk Yap -# Tim Chen -# -# This software is available to you under a choice of one of two -# licenses. 
You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## - -#include -#include - -## assume buffers not aligned -#define MOVDQ movdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - -################################ - -# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -# Load xmm with mem and byte swap each dword -.macro COPY_XMM_AND_BSWAP p1 p2 p3 - MOVDQ \p2, \p1 - pshufb \p3, \p1 -.endm - -################################ - -X0 = %xmm4 -X1 = %xmm5 -X2 = %xmm6 -X3 = %xmm7 - -XTMP0 = %xmm0 -XTMP1 = %xmm1 -XTMP2 = %xmm2 -XTMP3 = %xmm3 -XTMP4 = %xmm8 -XFER = %xmm9 - -SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %xmm12 - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg - -SRND = %rsi # clobbers INP -c = %ecx -d = %r8d -e = %edx -TBL = %r12 -a = %eax -b = %ebx - -f = %r9d -g = %r10d -h = %r11d - -y0 = %r13d -y1 = %r14d -y2 = %r15d - - - -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_XFER_SIZE = 16 -_XMM_SAVE_SIZE = 0 - -_INP_END = 0 -_INP = _INP_END + _INP_END_SIZE -_XFER = _INP + _INP_SIZE -_XMM_SAVE = _XFER + _XFER_SIZE -STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED - ## compute s0 four at a time and s1 two at a time - ## compute W[-16] + W[-7] 4 at a time - movdqa X3, XTMP0 - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - palignr $4, X2, XTMP0 # XTMP0 = W[-7] - ror $(22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - movdqa X1, XTMP1 - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 
# y2 = f^g - paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16] - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - ## compute s0 - palignr $4, X0, XTMP1 # XTMP1 = W[-15] - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - movdqa XTMP1, XTMP2 # XTMP2 = W[-15] - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH - movdqa XTMP1, XTMP3 # XTMP3 = W[-15] - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pslld $(32-7), XTMP1 # - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - psrld $7, XTMP2 # - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - # - ROTATE_ARGS # - movdqa XTMP3, XTMP2 # XTMP2 = W[-15] - mov e, y0 # y0 = e - mov a, y1 # y1 = a - movdqa XTMP3, XTMP4 # XTMP4 = W[-15] - ror $(25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(22-13), y1 # y1 = a >> (22-13) - pslld $(32-18), XTMP3 # - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - psrld $18, XTMP2 # - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP3, XTMP1 - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - psrld $3, XTMP4 # XTMP4 = W[-15] >> 3 - add y0, y2 # y2 = S1 + CH - add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pxor XTMP4, XTMP1 # XTMP1 = s0 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - ## compute low s1 - pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - - ROTATE_ARGS - movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA} - mov e, y0 # y0 = e - mov a, y1 # y1 = a - ror $(25-11), y0 # y0 = e >> (25-11) - movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA} - xor e, y0 # y0 = e ^ (e >> (25-11)) - ror $(22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} - xor g, y2 # y2 = f^g - psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP3, XTMP2 - add y0, y2 # y2 = S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - 
pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - ## compute high s1 - pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA} - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - # - ROTATE_ARGS # - movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC} - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - movdqa XTMP2, X0 # X0 = W[-2] {DDCC} - ror $(22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25 - and e, y2 # y2 = (f^g)&e - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - psrld $10, X0 # X0 = W[-2] >> 10 {DDCC} - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22 - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2 - xor g, y2 # y2 = CH = ((f^g)&e)^g - pxor XTMP3, XTMP2 # - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2 - add y0, y2 # y2 = S1 + CH - add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - pxor XTMP2, X0 # X0 = s1 {xDxC} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pshufb SHUF_DC00, X0 # X0 = s1 {DC00} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - - ROTATE_ARGS - rotate_Xs -.endm - -## input is [rsp + _XFER + %1 * 4] -.macro DO_ROUND round - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - xor e, y0 # y0 = e ^ (e >> (25-11)) - ror $(22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - and e, y2 # y2 = (f^g)&e - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - add y0, y2 # y2 = S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - offset = \round * 4 + _XFER - add offset(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -.endm - -######################################################################## -## void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], -## const u8 *data, size_t nblocks); -######################################################################## -.text -SYM_FUNC_START(sha256_transform_ssse3) - ANNOTATE_NOENDBR # since this is called only via static_call - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - mov %rsp, %rbp - - subq $STACK_SIZE, %rsp - and $~15, %rsp - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash 
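The message expansion that the FOUR_ROUNDS_AND_SCHED macro above interleaves with the rounds (computing s0 four words at a time and s1 two words at a time) is the standard FIPS 180-4 schedule. A scalar, standalone C sketch for comparison, illustrative only and not the kernel code:

	#include <stdint.h>

	static inline uint32_t ror32_c(uint32_t x, unsigned int r)
	{
		return (x >> r) | (x << (32 - r));
	}

	/* w[0..15] already hold the big-endian-loaded words of the block. */
	static void sketch_sha256_schedule(uint32_t w[64])
	{
		for (int t = 16; t < 64; t++) {
			uint32_t s0 = ror32_c(w[t - 15], 7) ^
				      ror32_c(w[t - 15], 18) ^ (w[t - 15] >> 3);
			uint32_t s1 = ror32_c(w[t - 2], 17) ^
				      ror32_c(w[t - 2], 19) ^ (w[t - 2] >> 10);

			w[t] = w[t - 16] + s0 + w[t - 7] + s1;
		}
	}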
- add INP, NUM_BLKS - mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data - - ## load initial digest - mov 4*0(CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - movdqa _SHUF_00BA(%rip), SHUF_00BA - movdqa _SHUF_DC00(%rip), SHUF_DC00 - -.Lloop0: - lea K256(%rip), TBL - - ## byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK - - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 16 each - mov $3, SRND -.align 16 -.Lloop1: - movdqa (TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 1*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 2*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 3*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - add $4*16, TBL - FOUR_ROUNDS_AND_SCHED - - sub $1, SRND - jne .Lloop1 - - mov $2, SRND -.Lloop2: - paddd (TBL), X0 - movdqa X0, _XFER(%rsp) - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - paddd 1*16(TBL), X1 - movdqa X1, _XFER(%rsp) - add $2*16, TBL - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - movdqa X2, X0 - movdqa X3, X1 - - sub $1, SRND - jne .Lloop2 - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - mov _INP(%rsp), INP - add $64, INP - cmp _INP_END(%rsp), INP - jne .Lloop0 - -.Ldone_hash: - - mov %rbp, %rsp - popq %rbp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - - RET -SYM_FUNC_END(sha256_transform_ssse3) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 -.align 16 -# shuffle xBxA -> 00BA -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 -.align 16 -# shuffle xDxC -> DC00 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/lib/crypto/sha256.c b/arch/x86/lib/crypto/sha256.c deleted file mode 100644 index 80380f8fdcee..000000000000 --- a/arch/x86/lib/crypto/sha256.c +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 optimized for x86_64 - * - * Copyright 2025 Google 
LLC - */ -#include -#include -#include -#include -#include - -asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86); - -DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3); - -void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - if (static_branch_likely(&have_sha256_x86)) { - kernel_fpu_begin(); - static_call(sha256_blocks_x86)(state, data, nblocks); - kernel_fpu_end(); - } else { - sha256_blocks_generic(state, data, nblocks); - } -} -EXPORT_SYMBOL_GPL(sha256_blocks_simd); - -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - sha256_blocks_generic(state, data, nblocks); -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -bool sha256_is_arch_optimized(void) -{ - return static_key_enabled(&have_sha256_x86); -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - -static int __init sha256_x86_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) { - static_call_update(sha256_blocks_x86, sha256_ni_transform); - } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | - XFEATURE_MASK_YMM, NULL) && - boot_cpu_has(X86_FEATURE_AVX)) { - if (boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_BMI2)) - static_call_update(sha256_blocks_x86, - sha256_transform_rorx); - else - static_call_update(sha256_blocks_x86, - sha256_transform_avx); - } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) { - return 0; - } - static_branch_enable(&have_sha256_x86); - return 0; -} -subsys_initcall(sha256_x86_mod_init); - -static void __exit sha256_x86_mod_exit(void) -{ -} -module_exit(sha256_x86_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 optimized for x86_64"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index b98543c7ef23..2460ddff967f 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -211,7 +211,7 @@ if SPARC source "lib/crypto/sparc/Kconfig" endif if X86 -source "arch/x86/lib/crypto/Kconfig" +source "lib/crypto/x86/Kconfig" endif endif diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 7c1e7d06cac9..5823137fa5a8 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -114,3 +114,4 @@ obj-$(CONFIG_PPC) += powerpc/ obj-$(CONFIG_RISCV) += riscv/ obj-$(CONFIG_S390) += s390/ obj-$(CONFIG_SPARC) += sparc/ +obj-$(CONFIG_X86) += x86/ diff --git a/lib/crypto/x86/.gitignore b/lib/crypto/x86/.gitignore new file mode 100644 index 000000000000..580c839bb177 --- /dev/null +++ b/lib/crypto/x86/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-x86_64-cryptogams.S diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig new file mode 100644 index 000000000000..5e94cdee492c --- /dev/null +++ b/lib/crypto/x86/Kconfig @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_BLAKE2S_X86 + bool "Hash functions: BLAKE2s (SSSE3/AVX-512)" + depends on 64BIT + select CRYPTO_LIB_BLAKE2S_GENERIC + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S + help + BLAKE2s cryptographic hash function (RFC 7693) + + Architecture: x86_64 using: + - SSSE3 (Supplemental SSE3) + - AVX-512 (Advanced Vector Extensions-512) + +config CRYPTO_CHACHA20_X86_64 + tristate 
+	depends on 64BIT
+	default CRYPTO_LIB_CHACHA
+	select CRYPTO_LIB_CHACHA_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_X86_64
+	tristate
+	depends on 64BIT
+	default CRYPTO_LIB_POLY1305
+	select CRYPTO_ARCH_HAVE_LIB_POLY1305
+
+config CRYPTO_SHA256_X86_64
+	tristate
+	depends on 64BIT
+	default CRYPTO_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
+	select CRYPTO_LIB_SHA256_GENERIC
diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile
new file mode 100644
index 000000000000..abceca3d31c0
--- /dev/null
+++ b/lib/crypto/x86/Makefile
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
+libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
+chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o
+
+obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
+poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
+targets += poly1305-x86_64-cryptogams.S
+
+obj-$(CONFIG_CRYPTO_SHA256_X86_64) += sha256-x86_64.o
+sha256-x86_64-y := sha256.o sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256-ni-asm.o
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $< > $@
+
+$(obj)/%.S: $(src)/%.pl FORCE
+	$(call if_changed,perlasm)
diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S
new file mode 100644
index 000000000000..ac1c845445a4
--- /dev/null
+++ b/lib/crypto/x86/blake2s-core.S
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved.
+ * Copyright (C) 2017-2019 Samuel Neves . All Rights Reserved.
+ */ + +#include + +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 +.align 32 +IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 + .octa 0x5BE0CD191F83D9AB9B05688C510E527F +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 +.section .rodata.cst16.ROR328, "aM", @progbits, 16 +.align 16 +ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 +.align 64 +SIGMA: +.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 +.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 +.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 +.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 +.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 +.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 +.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 +.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 +.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 +.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 +.align 64 +SIGMA2: +.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 +.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 +.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 +.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 +.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 +.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 +.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 +.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 +.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 + +.text +SYM_FUNC_START(blake2s_compress_ssse3) + testq %rdx,%rdx + je .Lendofloop + movdqu (%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqa ROT16(%rip),%xmm12 + movdqa ROR328(%rip),%xmm13 + movdqu 0x20(%rdi),%xmm14 + movq %rcx,%xmm15 + leaq SIGMA+0xa0(%rip),%r8 + jmp .Lbeginofloop + .align 32 +.Lbeginofloop: + movdqa %xmm0,%xmm10 + movdqa %xmm1,%xmm11 + paddq %xmm15,%xmm14 + movdqa IV(%rip),%xmm2 + movdqa %xmm14,%xmm3 + pxor IV+0x10(%rip),%xmm3 + leaq SIGMA(%rip),%rcx +.Lroundloop: + movzbl (%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0x1(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x2(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x3(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + punpckldq %xmm5,%xmm4 + punpckldq %xmm7,%xmm6 + punpcklqdq %xmm6,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0x4(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x5(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x6(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0x7(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + punpckldq %xmm6,%xmm5 + punpckldq %xmm4,%xmm7 + punpcklqdq %xmm7,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movzbl 0x8(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x9(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xa(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xb(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + punpckldq %xmm7,%xmm6 + punpckldq %xmm5,%xmm4 + punpcklqdq %xmm4,%xmm6 + 
paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0xc(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xd(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xe(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0xf(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + punpckldq %xmm4,%xmm7 + punpckldq %xmm6,%xmm5 + punpcklqdq %xmm5,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + addq $0x10,%rcx + cmpq %r8,%rcx + jnz .Lroundloop + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm1 + pxor %xmm10,%xmm0 + pxor %xmm11,%xmm1 + addq $0x40,%rsi + decq %rdx + jnz .Lbeginofloop + movdqu %xmm0,(%rdi) + movdqu %xmm1,0x10(%rdi) + movdqu %xmm14,0x20(%rdi) +.Lendofloop: + RET +SYM_FUNC_END(blake2s_compress_ssse3) + +SYM_FUNC_START(blake2s_compress_avx512) + vmovdqu (%rdi),%xmm0 + vmovdqu 0x10(%rdi),%xmm1 + vmovdqu 0x20(%rdi),%xmm4 + vmovq %rcx,%xmm5 + vmovdqa IV(%rip),%xmm14 + vmovdqa IV+16(%rip),%xmm15 + jmp .Lblake2s_compress_avx512_mainloop +.align 32 +.Lblake2s_compress_avx512_mainloop: + vmovdqa %xmm0,%xmm10 + vmovdqa %xmm1,%xmm11 + vpaddq %xmm5,%xmm4,%xmm4 + vmovdqa %xmm14,%xmm2 + vpxor %xmm15,%xmm4,%xmm3 + vmovdqu (%rsi),%ymm6 + vmovdqu 0x20(%rsi),%ymm7 + addq $0x40,%rsi + leaq SIGMA2(%rip),%rax + movb $0xa,%cl +.Lblake2s_compress_avx512_roundloop: + addq $0x40,%rax + vmovdqa -0x40(%rax),%ymm8 + vmovdqa -0x20(%rax),%ymm9 + vpermi2d %ymm7,%ymm6,%ymm8 + vpermi2d %ymm7,%ymm6,%ymm9 + vmovdqa %ymm8,%ymm6 + vmovdqa %ymm9,%ymm7 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm8,%xmm8 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x93,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x39,%xmm2,%xmm2 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm9,%xmm9 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x39,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x93,%xmm2,%xmm2 + decb %cl + jne .Lblake2s_compress_avx512_roundloop + vpxor %xmm10,%xmm0,%xmm0 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm2,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + decq %rdx + jne .Lblake2s_compress_avx512_mainloop + vmovdqu %xmm0,(%rdi) + vmovdqu %xmm1,0x10(%rdi) + vmovdqu %xmm4,0x20(%rdi) + vzeroupper + RET +SYM_FUNC_END(blake2s_compress_avx512) diff --git a/lib/crypto/x86/blake2s-glue.c b/lib/crypto/x86/blake2s-glue.c new file mode 100644 index 000000000000..adc296cd17c9 --- /dev/null +++ b/lib/crypto/x86/blake2s-glue.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); +asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); + +void blake2s_compress(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) +{ + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); + + if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { + blake2s_compress_generic(state, block, nblocks, inc); + return; + } + + do { + const size_t blocks = min_t(size_t, nblocks, + SZ_4K / BLAKE2S_BLOCK_SIZE); + + kernel_fpu_begin(); + if (static_branch_likely(&blake2s_use_avx512)) + blake2s_compress_avx512(state, block, blocks, inc); + else + blake2s_compress_ssse3(state, block, blocks, inc); + kernel_fpu_end(); + + nblocks -= blocks; + block += blocks * BLAKE2S_BLOCK_SIZE; + } while (nblocks); +} +EXPORT_SYMBOL(blake2s_compress); + +static int __init blake2s_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + static_branch_enable(&blake2s_use_ssse3); + + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL)) + static_branch_enable(&blake2s_use_avx512); + + return 0; +} + +subsys_initcall(blake2s_mod_init); diff --git a/lib/crypto/x86/chacha-avx2-x86_64.S b/lib/crypto/x86/chacha-avx2-x86_64.S new file mode 100644 index 000000000000..f3d8fc018249 --- /dev/null +++ b/lib/crypto/x86/chacha-avx2-x86_64.S @@ -0,0 +1,1021 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ChaCha 256-bit cipher algorithm, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + */ + +#include + +.section .rodata.cst32.ROT8, "aM", @progbits, 32 +.align 32 +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 + .octa 0x0e0d0c0f0a09080b0605040702010003 + +.section .rodata.cst32.ROT16, "aM", @progbits, 32 +.align 32 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 + .octa 0x0d0c0f0e09080b0a0504070601000302 + +.section .rodata.cst32.CTRINC, "aM", @progbits, 32 +.align 32 +CTRINC: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + +.text + +SYM_FUNC_START(chacha_2block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts two ChaCha blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. 
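The vector steps that follow are the RFC 8439 quarter round applied to all four columns (of both blocks) at once; the vpshufd shuffles between the two half-rounds are what switch the same column-wise arithmetic over to the diagonal quarter rounds. A scalar, standalone C sketch for reference, illustrative only and not the kernel implementation:

	#include <stdint.h>

	static inline uint32_t rotl32_c(uint32_t v, unsigned int n)
	{
		return (v << n) | (v >> (32 - n));
	}

	#define SKETCH_QR(a, b, c, d) do {				\
		(a) += (b); (d) ^= (a); (d) = rotl32_c((d), 16);	\
		(c) += (d); (b) ^= (c); (b) = rotl32_c((b), 12);	\
		(a) += (b); (d) ^= (a); (d) = rotl32_c((d), 8);		\
		(c) += (d); (b) ^= (c); (b) = rotl32_c((b), 7);		\
	} while (0)

	static void sketch_chacha_doubleround(uint32_t x[16])
	{
		/* Column round: one vector add/xor/rotate per row register
		 * handles all four of these quarter rounds together. */
		SKETCH_QR(x[0], x[4], x[8],  x[12]);
		SKETCH_QR(x[1], x[5], x[9],  x[13]);
		SKETCH_QR(x[2], x[6], x[10], x[14]);
		SKETCH_QR(x[3], x[7], x[11], x[15]);
		/* Diagonal round: reached in the assembly by the vpshufd
		 * word rotations rather than by re-indexing. */
		SKETCH_QR(x[0], x[5], x[10], x[15]);
		SKETCH_QR(x[1], x[6], x[11], x[12]);
		SKETCH_QR(x[2], x[7], x[8],  x[13]);
		SKETCH_QR(x[3], x[4], x[9],  x[14]);
	}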
+ + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + + vmovdqa ROT8(%rip),%ymm4 + vmovdqa ROT16(%rip),%ymm5 + + mov %rcx,%rax + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + sub $2,%r8d + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rax + jl .Lxorpart2 + vpxor 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rax + jl .Lxorpart2 + vpxor 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rax + jl .Lxorpart2 + vpxor 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rax + jl .Lxorpart2 + vpxor 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rax + jl .Lxorpart2 + vpxor 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rax + jl .Lxorpart2 + vpxor 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rax + jl .Lxorpart2 + vpxor 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rax + jl .Lxorpart2 + vpxor 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + RET + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone2 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov 
%rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm7,%xmm7 + vmovdqa %xmm7,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone2 + +SYM_FUNC_END(chacha_2block_xor_avx2) + +SYM_FUNC_START(chacha_4block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four ChaCha blocks by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + + vmovdqa ROT8(%rip),%ymm8 + vmovdqa ROT16(%rip),%ymm9 + + mov %rcx,%rax + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb 
%ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + sub $2,%r8d + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rax + jl .Lxorpart4 + vpxor 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rax + jl .Lxorpart4 + vpxor 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rax + jl .Lxorpart4 + vpxor 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rax + jl .Lxorpart4 + vpxor 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rax + jl .Lxorpart4 + vpxor 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rax + jl .Lxorpart4 + vpxor 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rax + jl .Lxorpart4 + vpxor 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rax + jl .Lxorpart4 + vpxor 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rax + jl .Lxorpart4 + vpxor 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rax + jl .Lxorpart4 + vpxor 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rax + jl .Lxorpart4 + vpxor 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rax + jl .Lxorpart4 + vpxor 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rax + jl .Lxorpart4 + vpxor 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rax + jl .Lxorpart4 + vpxor 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rax + jl .Lxorpart4 + vpxor 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rax + jl .Lxorpart4 + vpxor 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + RET + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm10,%xmm10 + 
vmovdqa %xmm10,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone4 + +SYM_FUNC_END(chacha_4block_xor_avx2) + +SYM_FUNC_START(chacha_8block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts eight consecutive ChaCha blocks by loading + # the state matrix in AVX registers eight times. As we need some + # scratch registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32-, 64- and then 128-bit + # words, which allows us to do XOR in AVX registers. 8/16-bit word + # rotation is done with the slightly better performing byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + vzeroupper + # 4 * 32 byte stack, 32-byte aligned + lea 8(%rsp),%r10 + and $~31, %rsp + sub $0x80, %rsp + mov %rcx,%rax + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + # x0..3 on stack + vmovdqa %ymm0,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm3,0x60(%rsp) + + vmovdqa CTRINC(%rip),%ymm1 + vmovdqa ROT8(%rip),%ymm2 + vmovdqa ROT16(%rip),%ymm3 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor 
%ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + 
vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + sub $2,%r8d + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpaddd 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpbroadcastd 0x04(%rdi),%ymm0 + vpaddd 0x20(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpbroadcastd 0x08(%rdi),%ymm0 + vpaddd 0x40(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpbroadcastd 0x0c(%rdi),%ymm0 + vpaddd 0x60(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpbroadcastd 0x10(%rdi),%ymm0 + vpaddd %ymm0,%ymm4,%ymm4 + vpbroadcastd 0x14(%rdi),%ymm0 + vpaddd %ymm0,%ymm5,%ymm5 + vpbroadcastd 0x18(%rdi),%ymm0 + vpaddd %ymm0,%ymm6,%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm0 + vpaddd %ymm0,%ymm7,%ymm7 + vpbroadcastd 0x20(%rdi),%ymm0 + vpaddd %ymm0,%ymm8,%ymm8 + vpbroadcastd 0x24(%rdi),%ymm0 + vpaddd %ymm0,%ymm9,%ymm9 + vpbroadcastd 0x28(%rdi),%ymm0 + vpaddd %ymm0,%ymm10,%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm0 + vpaddd %ymm0,%ymm11,%ymm11 + vpbroadcastd 0x30(%rdi),%ymm0 + vpaddd %ymm0,%ymm12,%ymm12 + vpbroadcastd 0x34(%rdi),%ymm0 + vpaddd %ymm0,%ymm13,%ymm13 + vpbroadcastd 0x38(%rdi),%ymm0 + vpaddd %ymm0,%ymm14,%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm0 + vpaddd %ymm0,%ymm15,%ymm15 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + + # interleave 32-bit words in state n, n+1 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x20(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm1,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpckldq %ymm5,%ymm0,%ymm4 + vpunpckhdq %ymm5,%ymm0,%ymm5 + vmovdqa %ymm6,%ymm0 + vpunpckldq %ymm7,%ymm0,%ymm6 + vpunpckhdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpckldq %ymm9,%ymm0,%ymm8 + vpunpckhdq %ymm9,%ymm0,%ymm9 + vmovdqa %ymm10,%ymm0 + vpunpckldq %ymm11,%ymm0,%ymm10 + vpunpckhdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpckldq %ymm13,%ymm0,%ymm12 + vpunpckhdq %ymm13,%ymm0,%ymm13 + vmovdqa %ymm14,%ymm0 + vpunpckldq %ymm15,%ymm0,%ymm14 + vpunpckhdq %ymm15,%ymm0,%ymm15 + + # interleave 64-bit words in state n, n+2 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x40(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpcklqdq %ymm6,%ymm0,%ymm4 + vpunpckhqdq %ymm6,%ymm0,%ymm6 + vmovdqa %ymm5,%ymm0 + vpunpcklqdq %ymm7,%ymm0,%ymm5 + vpunpckhqdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpcklqdq %ymm10,%ymm0,%ymm8 + vpunpckhqdq %ymm10,%ymm0,%ymm10 + vmovdqa %ymm9,%ymm0 + vpunpcklqdq %ymm11,%ymm0,%ymm9 + vpunpckhqdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpcklqdq %ymm14,%ymm0,%ymm12 + vpunpckhqdq %ymm14,%ymm0,%ymm14 + vmovdqa %ymm13,%ymm0 + vpunpcklqdq %ymm15,%ymm0,%ymm13 + vpunpckhqdq %ymm15,%ymm0,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa 0x00(%rsp),%ymm1 + vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 + cmp $0x0020,%rax + jl .Lxorpart8 + vpxor 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0000(%rsi) + vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rax + jl .Lxorpart8 + vpxor 0x0020(%rdx),%ymm0,%ymm0 + 
vmovdqu %ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vmovdqa 0x40(%rsp),%ymm1 + vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 + cmp $0x0060,%rax + jl .Lxorpart8 + vpxor 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rax + jl .Lxorpart8 + vpxor 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vmovdqa 0x20(%rsp),%ymm1 + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rax + jl .Lxorpart8 + vpxor 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rax + jl .Lxorpart8 + vpxor 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vmovdqa 0x60(%rsp),%ymm1 + vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 + cmp $0x00e0,%rax + jl .Lxorpart8 + vpxor 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rax + jl .Lxorpart8 + vpxor 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa %ymm4,%ymm0 + cmp $0x0120,%rax + jl .Lxorpart8 + vpxor 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0100(%rsi) + + vmovdqa %ymm12,%ymm0 + cmp $0x0140,%rax + jl .Lxorpart8 + vpxor 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0120(%rsi) + + vmovdqa %ymm6,%ymm0 + cmp $0x0160,%rax + jl .Lxorpart8 + vpxor 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0140(%rsi) + + vmovdqa %ymm14,%ymm0 + cmp $0x0180,%rax + jl .Lxorpart8 + vpxor 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0160(%rsi) + + vmovdqa %ymm5,%ymm0 + cmp $0x01a0,%rax + jl .Lxorpart8 + vpxor 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0180(%rsi) + + vmovdqa %ymm13,%ymm0 + cmp $0x01c0,%rax + jl .Lxorpart8 + vpxor 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01a0(%rsi) + + vmovdqa %ymm7,%ymm0 + cmp $0x01e0,%rax + jl .Lxorpart8 + vpxor 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01c0(%rsi) + + vmovdqa %ymm15,%ymm0 + cmp $0x0200,%rax + jl .Lxorpart8 + vpxor 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + lea -8(%r10),%rsp + RET + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x1f,%r9 + jz .Ldone8 + and $~0x1f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone8 + +SYM_FUNC_END(chacha_8block_xor_avx2) diff --git a/lib/crypto/x86/chacha-avx512vl-x86_64.S b/lib/crypto/x86/chacha-avx512vl-x86_64.S new file mode 100644 index 000000000000..259383e1ad44 --- /dev/null +++ b/lib/crypto/x86/chacha-avx512vl-x86_64.S @@ -0,0 +1,836 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions + * + * Copyright (C) 2018 Martin Willi + */ + +#include + +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + +.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 +.align 32 +CTR8BL: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.text + 
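The CTR2BL, CTR4BL and CTR8BL constants defined above are per-lane block-counter offsets: a single vpaddd of one of them adds 0,1 (CTR2BL), 2,3 (CTR4BL) or 0..7 (CTR8BL) to the block-counter word of each interleaved block. A minimal scalar sketch of the bookkeeping this replaces is shown below; it is an illustration only, not part of the patch, and the helper name is made up.

	#include <stdint.h>
	#include <stddef.h>

	/*
	 * Illustration only: when nlanes blocks are computed in parallel,
	 * lane i must use block counter state[12] + i.  The CTR*BL
	 * constants encode exactly these offsets, so one vector add fixes
	 * up every lane's counter at once.
	 */
	static void set_lane_counters(const uint32_t state[16],
				      uint32_t ctr_lane[], size_t nlanes)
	{
		for (size_t i = 0; i < nlanes; i++)
			ctr_lane[i] = state[12] + (uint32_t)i;
	}

Because only the counter word differs between consecutive blocks, this is the only per-block fix-up the multi-block paths need before entering their round loops.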
+SYM_FUNC_START(chacha_2block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts two ChaCha blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. + + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + sub $2,%r8d + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rcx + jl .Lxorpart2 + vpxord 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rcx + jl .Lxorpart2 + vpxord 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rcx + jl .Lxorpart2 + vpxord 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rcx + jl .Lxorpart2 + vpxord 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rcx + jl .Lxorpart2 + vpxord 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rcx + jl .Lxorpart2 + vpxord 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rcx + jl .Lxorpart2 + vpxord 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rcx + jl .Lxorpart2 + vpxord 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + RET + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone2 + mov %rax,%r9 + 
and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm7,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone2 + +SYM_FUNC_END(chacha_2block_xor_avx512vl) + +SYM_FUNC_START(chacha_4block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four ChaCha blocks by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + sub $2,%r8d + jnz .Ldoubleround4 + 
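For readers cross-checking the .Ldoubleround4 loop above against the specification: each vpaddd/vpxord/vprold group is one step of the standard ChaCha quarter round applied to a full vector of 32-bit words at once, and the vpshufd shuffles between the two half-rounds rotate rows 1-3 so that the same column arithmetic hits the diagonals, then rotate them back. A plain C reference of the double round (per RFC 8439) is sketched below; it is for illustration only and is not taken from this patch.

	#include <stdint.h>

	static inline uint32_t rotl32(uint32_t v, int n)
	{
		return (v << n) | (v >> (32 - n));
	}

	/* One ChaCha quarter round on four words of the 4x4 state. */
	static void qround(uint32_t x[16], int a, int b, int c, int d)
	{
		x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
		x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
		x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
		x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
	}

	/* One double round: four column rounds, then four diagonal rounds. */
	static void double_round(uint32_t x[16])
	{
		qround(x, 0, 4,  8, 12);
		qround(x, 1, 5,  9, 13);
		qround(x, 2, 6, 10, 14);
		qround(x, 3, 7, 11, 15);
		qround(x, 0, 5, 10, 15);
		qround(x, 1, 6, 11, 12);
		qround(x, 2, 7,  8, 13);
		qround(x, 3, 4,  9, 14);
	}

nrounds/2 iterations of this double round, followed by adding the initial state back in, yield one block of keystream; AVX-512VL gets the rotates for free via vprold, whereas the SSSE3 and AVX2 paths emulate them with byte shuffles (rotates by 8 and 16) or shift+OR (rotates by 7 and 12).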
+ # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rcx + jl .Lxorpart4 + vpxord 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rcx + jl .Lxorpart4 + vpxord 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rcx + jl .Lxorpart4 + vpxord 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rcx + jl .Lxorpart4 + vpxord 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rcx + jl .Lxorpart4 + vpxord 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rcx + jl .Lxorpart4 + vpxord 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rcx + jl .Lxorpart4 + vpxord 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rcx + jl .Lxorpart4 + vpxord 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rcx + jl .Lxorpart4 + vpxord 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rcx + jl .Lxorpart4 + vpxord 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rcx + jl .Lxorpart4 + vpxord 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rcx + jl .Lxorpart4 + vpxord 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rcx + jl .Lxorpart4 + vpxord 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rcx + jl .Lxorpart4 + vpxord 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rcx + jl .Lxorpart4 + vpxord 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rcx + jl .Lxorpart4 + vpxord 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + RET + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone4 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm10,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone4 + +SYM_FUNC_END(chacha_4block_xor_avx512vl) + +SYM_FUNC_START(chacha_8block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts eight consecutive ChaCha blocks by loading + # the state matrix in AVX registers eight times. Compared to AVX2, this + # mostly benefits from the new rotate instructions in VL and the + # additional registers. 
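As the header comment above says, the eight-block path keeps the state word-sliced: after the vpbroadcastd loads, register n holds state word n of all eight blocks, so the rounds need no word shuffling and the transpose back to block order is deferred until the final XOR. A rough C picture of the finalization step is sketched below; it is an illustration only, not part of the patch, and the array and function names are invented.

	#include <stdint.h>

	/*
	 * Illustration only: v[n][b] is state word n of block b (the
	 * word-sliced layout, one ymm register per word in the asm).
	 * init[][] stands for the copy saved in %ymm16..%ymm31 before
	 * .Ldoubleround8, i.e. the broadcast state with the per-block
	 * counter already added to word 12.
	 */
	static void finalize_and_transpose(const uint32_t init[16][8],
					   const uint32_t v[16][8],
					   uint32_t out[8][16])
	{
		for (int n = 0; n < 16; n++)
			for (int b = 0; b < 8; b++)
				/* vpaddd of the saved state, then the
				 * vpunpckldq/vpunpckhdq, vpunpcklqdq/
				 * vpunpckhqdq and vperm2i128 transpose */
				out[b][n] = v[n][b] + init[n][b];
	}

Each out[b] is then XORed with the b-th 64-byte input block, with the .Lxorpart8 byte-mask path handling any partial final block.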
+ + vzeroupper + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + + # x12 += counter values 0-3 + vpaddd CTR8BL(%rip),%ymm12,%ymm12 + + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm1,%ymm17 + vmovdqa64 %ymm2,%ymm18 + vmovdqa64 %ymm3,%ymm19 + vmovdqa64 %ymm4,%ymm20 + vmovdqa64 %ymm5,%ymm21 + vmovdqa64 %ymm6,%ymm22 + vmovdqa64 %ymm7,%ymm23 + vmovdqa64 %ymm8,%ymm24 + vmovdqa64 %ymm9,%ymm25 + vmovdqa64 %ymm10,%ymm26 + vmovdqa64 %ymm11,%ymm27 + vmovdqa64 %ymm12,%ymm28 + vmovdqa64 %ymm13,%ymm29 + vmovdqa64 %ymm14,%ymm30 + vmovdqa64 %ymm15,%ymm31 + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ 
x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + + sub $2,%r8d + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpaddd %ymm16,%ymm0,%ymm0 + vpaddd %ymm17,%ymm1,%ymm1 + vpaddd %ymm18,%ymm2,%ymm2 + vpaddd %ymm19,%ymm3,%ymm3 + vpaddd %ymm20,%ymm4,%ymm4 + vpaddd %ymm21,%ymm5,%ymm5 + vpaddd %ymm22,%ymm6,%ymm6 + vpaddd %ymm23,%ymm7,%ymm7 + vpaddd %ymm24,%ymm8,%ymm8 + vpaddd %ymm25,%ymm9,%ymm9 + vpaddd %ymm26,%ymm10,%ymm10 + vpaddd %ymm27,%ymm11,%ymm11 + vpaddd %ymm28,%ymm12,%ymm12 + vpaddd %ymm29,%ymm13,%ymm13 + vpaddd %ymm30,%ymm14,%ymm14 + vpaddd %ymm31,%ymm15,%ymm15 + + # interleave 32-bit words in state n, n+1 + vpunpckldq %ymm1,%ymm0,%ymm16 + vpunpckhdq %ymm1,%ymm0,%ymm17 + vpunpckldq %ymm3,%ymm2,%ymm18 + vpunpckhdq %ymm3,%ymm2,%ymm19 + vpunpckldq %ymm5,%ymm4,%ymm20 + vpunpckhdq %ymm5,%ymm4,%ymm21 + vpunpckldq %ymm7,%ymm6,%ymm22 + vpunpckhdq %ymm7,%ymm6,%ymm23 + vpunpckldq %ymm9,%ymm8,%ymm24 + vpunpckhdq %ymm9,%ymm8,%ymm25 + vpunpckldq %ymm11,%ymm10,%ymm26 + vpunpckhdq %ymm11,%ymm10,%ymm27 + vpunpckldq %ymm13,%ymm12,%ymm28 + vpunpckhdq %ymm13,%ymm12,%ymm29 + vpunpckldq %ymm15,%ymm14,%ymm30 + vpunpckhdq %ymm15,%ymm14,%ymm31 + + # interleave 64-bit words in state n, n+2 + vpunpcklqdq %ymm18,%ymm16,%ymm0 + vpunpcklqdq %ymm19,%ymm17,%ymm1 + vpunpckhqdq %ymm18,%ymm16,%ymm2 + vpunpckhqdq %ymm19,%ymm17,%ymm3 + vpunpcklqdq %ymm22,%ymm20,%ymm4 + vpunpcklqdq %ymm23,%ymm21,%ymm5 + vpunpckhqdq %ymm22,%ymm20,%ymm6 + vpunpckhqdq %ymm23,%ymm21,%ymm7 + vpunpcklqdq %ymm26,%ymm24,%ymm8 + vpunpcklqdq %ymm27,%ymm25,%ymm9 + vpunpckhqdq %ymm26,%ymm24,%ymm10 + vpunpckhqdq %ymm27,%ymm25,%ymm11 + vpunpcklqdq %ymm30,%ymm28,%ymm12 + vpunpcklqdq %ymm31,%ymm29,%ymm13 + vpunpckhqdq %ymm30,%ymm28,%ymm14 + vpunpckhqdq %ymm31,%ymm29,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa64 %ymm0,%ymm16 + vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 + cmp $0x0020,%rcx + jl .Lxorpart8 + vpxord 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0000(%rsi) + vmovdqa64 %ymm16,%ymm0 + vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rcx + jl .Lxorpart8 + vpxord 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu64 
%ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 + cmp $0x0060,%rcx + jl .Lxorpart8 + vpxord 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rcx + jl .Lxorpart8 + vpxord 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rcx + jl .Lxorpart8 + vpxord 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rcx + jl .Lxorpart8 + vpxord 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 + cmp $0x00e0,%rcx + jl .Lxorpart8 + vpxord 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rcx + jl .Lxorpart8 + vpxord 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa64 %ymm4,%ymm0 + cmp $0x0120,%rcx + jl .Lxorpart8 + vpxord 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0100(%rsi) + + vmovdqa64 %ymm12,%ymm0 + cmp $0x0140,%rcx + jl .Lxorpart8 + vpxord 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0120(%rsi) + + vmovdqa64 %ymm6,%ymm0 + cmp $0x0160,%rcx + jl .Lxorpart8 + vpxord 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0140(%rsi) + + vmovdqa64 %ymm14,%ymm0 + cmp $0x0180,%rcx + jl .Lxorpart8 + vpxord 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0160(%rsi) + + vmovdqa64 %ymm5,%ymm0 + cmp $0x01a0,%rcx + jl .Lxorpart8 + vpxord 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0180(%rsi) + + vmovdqa64 %ymm13,%ymm0 + cmp $0x01c0,%rcx + jl .Lxorpart8 + vpxord 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01a0(%rsi) + + vmovdqa64 %ymm7,%ymm0 + cmp $0x01e0,%rcx + jl .Lxorpart8 + vpxord 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01c0(%rsi) + + vmovdqa64 %ymm15,%ymm0 + cmp $0x0200,%rcx + jl .Lxorpart8 + vpxord 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + RET + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0x1f,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0x1f,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} + vpxord %ymm0,%ymm1,%ymm1 + vmovdqu8 %ymm1,(%rsi,%r9){%k1} + + jmp .Ldone8 + +SYM_FUNC_END(chacha_8block_xor_avx512vl) diff --git a/lib/crypto/x86/chacha-ssse3-x86_64.S b/lib/crypto/x86/chacha-ssse3-x86_64.S new file mode 100644 index 000000000000..7111949cd5b9 --- /dev/null +++ b/lib/crypto/x86/chacha-ssse3-x86_64.S @@ -0,0 +1,791 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + */ + +#include +#include + +.section .rodata.cst16.ROT8, "aM", @progbits, 16 +.align 16 +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 +.section .rodata.cst16.CTRINC, "aM", @progbits, 16 +.align 16 +CTRINC: .octa 0x00000003000000020000000100000000 + +.text + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. 
This + * function performs matrix operations on four words in parallel, but requires + * shuffling to rearrange the words after each round. 8/16-bit word rotation is + * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word + * rotation uses traditional shift+OR. + * + * The round count is given in %r8d. + * + * Clobbers: %r8d, %xmm4-%xmm7 + */ +SYM_FUNC_START_LOCAL(chacha_permute) + + movdqa ROT8(%rip),%xmm4 + movdqa ROT16(%rip),%xmm5 + +.Ldoubleround: + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm3,%xmm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm3,%xmm3 + + sub $2,%r8d + jnz .Ldoubleround + + RET +SYM_FUNC_END(chacha_permute) + +SYM_FUNC_START(chacha_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 1 data block output, o + # %rdx: up to 1 data block input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + FRAME_BEGIN + + # x0..3 = s0..3 + movdqu 0x00(%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqu 0x20(%rdi),%xmm2 + movdqu 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + mov %rcx,%rax + call chacha_permute + + # o0 = i0 ^ (x0 + s0) + paddd %xmm8,%xmm0 + cmp $0x10,%rax + jl .Lxorpart + movdqu 0x00(%rdx),%xmm4 + pxor %xmm4,%xmm0 + movdqu %xmm0,0x00(%rsi) + # o1 = i1 ^ (x1 + s1) + paddd %xmm9,%xmm1 + movdqa %xmm1,%xmm0 + cmp $0x20,%rax + jl .Lxorpart + movdqu 0x10(%rdx),%xmm0 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) + # o2 = i2 ^ (x2 + s2) + paddd %xmm10,%xmm2 + movdqa %xmm2,%xmm0 + cmp $0x30,%rax + jl .Lxorpart + movdqu 0x20(%rdx),%xmm0 + pxor %xmm2,%xmm0 + movdqu %xmm0,0x20(%rsi) + # o3 = i3 ^ (x3 + s3) + paddd %xmm11,%xmm3 + movdqa %xmm3,%xmm0 + cmp $0x40,%rax + jl .Lxorpart + movdqu 0x30(%rdx),%xmm0 + pxor %xmm3,%xmm0 + movdqu %xmm0,0x30(%rsi) + +.Ldone: + FRAME_END + RET + +.Lxorpart: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp 
+ jmp .Ldone + +SYM_FUNC_END(chacha_block_xor_ssse3) + +SYM_FUNC_START(hchacha_block_ssse3) + # %rdi: Input state matrix, s + # %rsi: output (8 32-bit words) + # %edx: nrounds + FRAME_BEGIN + + movdqu 0x00(%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqu 0x20(%rdi),%xmm2 + movdqu 0x30(%rdi),%xmm3 + + mov %edx,%r8d + call chacha_permute + + movdqu %xmm0,0x00(%rsi) + movdqu %xmm3,0x10(%rsi) + + FRAME_END + RET +SYM_FUNC_END(hchacha_block_ssse3) + +SYM_FUNC_START(chacha_4block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four consecutive ChaCha blocks by loading the + # the state matrix in SSE registers four times. As we need some scratch + # registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32- and then 64-bit words, + # which allows us to do XOR in SSE registers. 8/16-bit word rotation is + # done with the slightly better performing SSSE3 byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + lea 8(%rsp),%r10 + sub $0x80,%rsp + and $~63,%rsp + mov %rcx,%rax + + # x0..15[0-3] = s0..3[0..3] + movq 0x00(%rdi),%xmm1 + pshufd $0x00,%xmm1,%xmm0 + pshufd $0x55,%xmm1,%xmm1 + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + movq 0x10(%rdi),%xmm5 + pshufd $0x00,%xmm5,%xmm4 + pshufd $0x55,%xmm5,%xmm5 + movq 0x18(%rdi),%xmm7 + pshufd $0x00,%xmm7,%xmm6 + pshufd $0x55,%xmm7,%xmm7 + movq 0x20(%rdi),%xmm9 + pshufd $0x00,%xmm9,%xmm8 + pshufd $0x55,%xmm9,%xmm9 + movq 0x28(%rdi),%xmm11 + pshufd $0x00,%xmm11,%xmm10 + pshufd $0x55,%xmm11,%xmm11 + movq 0x30(%rdi),%xmm13 + pshufd $0x00,%xmm13,%xmm12 + pshufd $0x55,%xmm13,%xmm13 + movq 0x38(%rdi),%xmm15 + pshufd $0x00,%xmm15,%xmm14 + pshufd $0x55,%xmm15,%xmm15 + # x0..3 on stack + movdqa %xmm0,0x00(%rsp) + movdqa %xmm1,0x10(%rsp) + movdqa %xmm2,0x20(%rsp) + movdqa %xmm3,0x30(%rsp) + + movdqa CTRINC(%rip),%xmm1 + movdqa ROT8(%rip),%xmm2 + movdqa ROT16(%rip),%xmm3 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + +.Ldoubleround4: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + 
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld 
$25,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + + sub $2,%r8d + jnz .Ldoubleround4 + + # x0[0-3] += s0[0] + # x1[0-3] += s0[1] + movq 0x00(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x00(%rsp),%xmm2 + movdqa %xmm2,0x00(%rsp) + paddd 0x10(%rsp),%xmm3 + movdqa %xmm3,0x10(%rsp) + # x2[0-3] += s0[2] + # x3[0-3] += s0[3] + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x20(%rsp),%xmm2 + movdqa %xmm2,0x20(%rsp) + paddd 0x30(%rsp),%xmm3 + movdqa %xmm3,0x30(%rsp) + + # x4[0-3] += s1[0] + # x5[0-3] += s1[1] + movq 0x10(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + # x6[0-3] += s1[2] + # x7[0-3] += s1[3] + movq 0x18(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + + # x8[0-3] += s2[0] + # x9[0-3] += s2[1] + movq 0x20(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm8 + paddd %xmm3,%xmm9 + # x10[0-3] += s2[2] + # x11[0-3] += s2[3] + movq 0x28(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm10 + paddd %xmm3,%xmm11 + + # x12[0-3] += s3[0] + # x13[0-3] += s3[1] + movq 0x30(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm12 + paddd %xmm3,%xmm13 + # x14[0-3] += s3[2] + # x15[0-3] += s3[3] + movq 0x38(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm14 + paddd %xmm3,%xmm15 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + # interleave 32-bit words in state n, n+1 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x10(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x10(%rsp) + movdqa 0x20(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x20(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpckldq %xmm5,%xmm4 + punpckhdq %xmm5,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm6,%xmm0 + punpckldq %xmm7,%xmm6 + punpckhdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm9,%xmm0 + movdqa %xmm0,%xmm9 + movdqa %xmm10,%xmm0 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpckldq %xmm13,%xmm12 + punpckhdq %xmm13,%xmm0 + movdqa %xmm0,%xmm13 + movdqa %xmm14,%xmm0 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # interleave 64-bit words in state n, n+2 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x20(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x20(%rsp) + movdqa 0x10(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x10(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpcklqdq %xmm6,%xmm4 + punpckhqdq %xmm6,%xmm0 + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + punpcklqdq %xmm7,%xmm5 + punpckhqdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpcklqdq %xmm10,%xmm8 + punpckhqdq %xmm10,%xmm0 + movdqa %xmm0,%xmm10 + movdqa %xmm9,%xmm0 + punpcklqdq %xmm11,%xmm9 + punpckhqdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpcklqdq %xmm14,%xmm12 + punpckhqdq %xmm14,%xmm0 + movdqa %xmm0,%xmm14 + movdqa %xmm13,%xmm0 + punpcklqdq 
%xmm15,%xmm13 + punpckhqdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # xor with corresponding input, write to output + movdqa 0x00(%rsp),%xmm0 + cmp $0x10,%rax + jl .Lxorpart4 + movdqu 0x00(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x00(%rsi) + + movdqu %xmm4,%xmm0 + cmp $0x20,%rax + jl .Lxorpart4 + movdqu 0x10(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) + + movdqu %xmm8,%xmm0 + cmp $0x30,%rax + jl .Lxorpart4 + movdqu 0x20(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x20(%rsi) + + movdqu %xmm12,%xmm0 + cmp $0x40,%rax + jl .Lxorpart4 + movdqu 0x30(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x30(%rsi) + + movdqa 0x20(%rsp),%xmm0 + cmp $0x50,%rax + jl .Lxorpart4 + movdqu 0x40(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x40(%rsi) + + movdqu %xmm6,%xmm0 + cmp $0x60,%rax + jl .Lxorpart4 + movdqu 0x50(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x50(%rsi) + + movdqu %xmm10,%xmm0 + cmp $0x70,%rax + jl .Lxorpart4 + movdqu 0x60(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x60(%rsi) + + movdqu %xmm14,%xmm0 + cmp $0x80,%rax + jl .Lxorpart4 + movdqu 0x70(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x70(%rsi) + + movdqa 0x10(%rsp),%xmm0 + cmp $0x90,%rax + jl .Lxorpart4 + movdqu 0x80(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x80(%rsi) + + movdqu %xmm5,%xmm0 + cmp $0xa0,%rax + jl .Lxorpart4 + movdqu 0x90(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x90(%rsi) + + movdqu %xmm9,%xmm0 + cmp $0xb0,%rax + jl .Lxorpart4 + movdqu 0xa0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xa0(%rsi) + + movdqu %xmm13,%xmm0 + cmp $0xc0,%rax + jl .Lxorpart4 + movdqu 0xb0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xb0(%rsi) + + movdqa 0x30(%rsp),%xmm0 + cmp $0xd0,%rax + jl .Lxorpart4 + movdqu 0xc0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xc0(%rsi) + + movdqu %xmm7,%xmm0 + cmp $0xe0,%rax + jl .Lxorpart4 + movdqu 0xd0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xd0(%rsi) + + movdqu %xmm11,%xmm0 + cmp $0xf0,%rax + jl .Lxorpart4 + movdqu 0xe0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xe0(%rsi) + + movdqu %xmm15,%xmm0 + cmp $0x100,%rax + jl .Lxorpart4 + movdqu 0xf0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xf0(%rsi) + +.Ldone4: + lea -8(%r10),%rsp + RET + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone4 + +SYM_FUNC_END(chacha_4block_xor_ssse3) diff --git a/lib/crypto/x86/chacha_glue.c b/lib/crypto/x86/chacha_glue.c new file mode 100644 index 000000000000..10b2c945f541 --- /dev/null +++ b/lib/crypto/x86/chacha_glue.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ChaCha and HChaCha functions (x86_64 optimized) + * + * Copyright (C) 2015 Martin Willi + */ + +#include +#include +#include +#include +#include +#include + +asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void hchacha_block_ssse3(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void 
chacha_4block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); + +static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) +{ + len = min(len, maxblocks * CHACHA_BLOCK_SIZE); + return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; +} + +static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (static_branch_likely(&chacha_use_avx512vl)) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state->x[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_2block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 2); + return; + } + } + + if (static_branch_likely(&chacha_use_avx2)) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state->x[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 2); + return; + } + } + + while (bytes >= CHACHA_BLOCK_SIZE * 4) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; + state->x[12] += 4; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); + state->x[12]++; + } +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!static_branch_likely(&chacha_use_simd)) { + hchacha_block_generic(state, out, nrounds); + } else { + kernel_fpu_begin(); + hchacha_block_ssse3(state, out, nrounds); + kernel_fpu_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state 
*state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&chacha_use_simd) || + bytes <= CHACHA_BLOCK_SIZE) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_fpu_begin(); + chacha_dosimd(state, dst, src, todo, nrounds); + kernel_fpu_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&chacha_use_simd); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_simd_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_SSSE3)) + return 0; + + static_branch_enable(&chacha_use_simd); + + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + static_branch_enable(&chacha_use_avx2); + + if (boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ + static_branch_enable(&chacha_use_avx512vl); + } + return 0; +} +subsys_initcall(chacha_simd_mod_init); + +static void __exit chacha_simd_mod_exit(void) +{ +} +module_exit(chacha_simd_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi "); +MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)"); diff --git a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl new file mode 100644 index 000000000000..501827254fed --- /dev/null +++ b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl @@ -0,0 +1,4253 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# Copyright (C) 2017-2018 Samuel Neves . All Rights Reserved. +# Copyright (C) 2017-2019 Jason A. Donenfeld . All Rights Reserved. +# Copyright (C) 2006-2017 CRYPTOGAMS by . All Rights Reserved. +# +# This code is taken from the OpenSSL project but the author, Andy Polyakov, +# has relicensed it under the licenses specified in the SPDX header above. +# The original headers, including the original license headers, are +# included below for completeness. +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for x86_64. +# +# March 2015 +# +# Initial release. +# +# December 2016 +# +# Add AVX512F+VL+BW code path. +# +# November 2017 +# +# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be +# executed even on Knights Landing. Trigger for modification was +# observation that AVX512 code paths can negatively affect overall +# Skylake-X system performance. Since we are likely to suppress +# AVX512F capability flag [at least on Skylake-X], conversion serves +# as kind of "investment protection". Note that next *lake processor, +# Cannonlake, has AVX512IFMA code path to execute... +# +# Numbers are cycles per processed byte with poly1305_blocks alone, +# measured with rdtsc at fixed clock frequency. +# +# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 +# P4 4.46/+120% - +# Core 2 2.41/+90% - +# Westmere 1.88/+120% - +# Sandy Bridge 1.39/+140% 1.10 +# Haswell 1.14/+175% 1.11 0.65 +# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] +# Silvermont 2.83/+95% - +# Knights L 3.60/? 
1.65 1.10 0.41(***) +# Goldmont 1.70/+180% - +# VIA Nano 1.82/+150% - +# Sledgehammer 1.38/+160% - +# Bulldozer 2.30/+130% 0.97 +# Ryzen 1.15/+200% 1.08 1.18 +# +# (*) improvement coefficients relative to clang are more modest and +# are ~50% on most processors, in both cases we are comparing to +# __int128 code; +# (**) SSE2 implementation was attempted, but among non-AVX processors +# it was faster than integer-only code only on older Intel P4 and +# Core processors, 50-30%, less newer processor is, but slower on +# contemporary ones, for example almost 2x slower on Atom, and as +# former are naturally disappearing, SSE2 is deemed unnecessary; +# (***) strangely enough performance seems to vary from core to core, +# listed result is best case; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); +$kernel=0; $kernel=1 if (!$flavour && !$output); + +if (!$kernel) { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or + die "can't locate x86_64-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; + + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); + } + + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); + $avx += 1 if ($1==2.11 && $2>=8); + } + + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); + } + + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); + } +} else { + $avx = 4; # The kernel uses ifdefs for this. 
+} + +sub declare_function() { + my ($name, $align, $nargs) = @_; + if($kernel) { + $code .= "SYM_FUNC_START($name)\n"; + $code .= ".L$name:\n"; + } else { + $code .= ".globl $name\n"; + $code .= ".type $name,\@function,$nargs\n"; + $code .= ".align $align\n"; + $code .= "$name:\n"; + } +} + +sub declare_typed_function() { + my ($name, $align, $nargs) = @_; + if($kernel) { + $code .= "SYM_TYPED_FUNC_START($name)\n"; + $code .= ".L$name:\n"; + } else { + $code .= ".globl $name\n"; + $code .= ".type $name,\@function,$nargs\n"; + $code .= ".align $align\n"; + $code .= "$name:\n"; + } +} + +sub end_function() { + my ($name) = @_; + if($kernel) { + $code .= "SYM_FUNC_END($name)\n"; + } else { + $code .= ".size $name,.-$name\n"; + } +} + +$code.=<<___ if $kernel; +#include +___ + +if ($avx) { +$code.=<<___ if $kernel; +.section .rodata +___ +$code.=<<___; +.align 64 +.Lconst: +.Lmask24: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.L129: +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 +.Lmask26: +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +___ +} +$code.=<<___ if (!$kernel); +.asciz "Poly1305 for x86_64, CRYPTOGAMS by " +.align 16 +___ + +my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); +my ($mac,$nonce)=($inp,$len); # *_emit arguments +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); +my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); + +sub poly1305_iteration { +# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 +# output: $h0-$h2 *= $r0-$r1 +$code.=<<___; + mulq $h0 # h0*r1 + mov %rax,$d2 + mov $r0,%rax + mov %rdx,$d3 + + mulq $h0 # h0*r0 + mov %rax,$h0 # future $h0 + mov $r0,%rax + mov %rdx,$d1 + + mulq $h1 # h1*r0 + add %rax,$d2 + mov $s1,%rax + adc %rdx,$d3 + + mulq $h1 # h1*s1 + mov $h2,$h1 # borrow $h1 + add %rax,$h0 + adc %rdx,$d1 + + imulq $s1,$h1 # h2*s1 + add $h1,$d2 + mov $d1,$h1 + adc \$0,$d3 + + imulq $r0,$h2 # h2*r0 + add $d2,$h1 + mov \$-4,%rax # mask value + adc $h2,$d3 + + and $d3,%rax # last reduction step + mov $d3,$h2 + shr \$2,$d3 + and \$3,$h2 + add $d3,%rax + add %rax,$h0 + adc \$0,$h1 + adc \$0,$h2 +___ +} + +######################################################################## +# Layout of opaque area is following. 
+# +# unsigned __int64 h[3]; # current hash value base 2^64 +# unsigned __int64 r[2]; # key value base 2^64 + +$code.=<<___; +.text +___ +$code.=<<___ if (!$kernel); +.extern OPENSSL_ia32cap_P + +.globl poly1305_block_init_arch +.hidden poly1305_block_init_arch +.globl poly1305_blocks_x86_64 +.hidden poly1305_blocks_x86_64 +.globl poly1305_emit_x86_64 +.hidden poly1305_emit_x86_64 +___ +&declare_typed_function("poly1305_block_init_arch", 32, 3); +$code.=<<___; + xor %eax,%eax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + + test $inp,$inp + je .Lno_key +___ +$code.=<<___ if (!$kernel); + lea poly1305_blocks_x86_64(%rip),%r10 + lea poly1305_emit_x86_64(%rip),%r11 +___ +$code.=<<___ if (!$kernel && $avx); + mov OPENSSL_ia32cap_P+4(%rip),%r9 + lea poly1305_blocks_avx(%rip),%rax + lea poly1305_emit_avx(%rip),%rcx + bt \$`60-32`,%r9 # AVX? + cmovc %rax,%r10 + cmovc %rcx,%r11 +___ +$code.=<<___ if (!$kernel && $avx>1); + lea poly1305_blocks_avx2(%rip),%rax + bt \$`5+32`,%r9 # AVX2? + cmovc %rax,%r10 +___ +$code.=<<___ if (!$kernel && $avx>3); + mov \$`(1<<31|1<<21|1<<16)`,%rax + shr \$32,%r9 + and %rax,%r9 + cmp %rax,%r9 + je .Linit_base2_44 +___ +$code.=<<___; + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + and 8($inp),%rcx + mov %rax,24($ctx) + mov %rcx,32($ctx) +___ +$code.=<<___ if (!$kernel && $flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if (!$kernel && $flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax +.Lno_key: + RET +___ +&end_function("poly1305_block_init_arch"); + +&declare_function("poly1305_blocks_x86_64", 32, 4); +$code.=<<___; +.cfi_startproc +.Lblocks: + shr \$4,$len + jz .Lno_data # too short + + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $ctx +.cfi_push $ctx +.Lblocks_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2 + + mov $s1,$r1 + shr \$2,$s1 + mov $r1,%rax + add $r1,$s1 # s1 = r1 + (r1 >> 2) + jmp .Loop + +.align 32 +.Loop: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 +___ + + &poly1305_iteration(); + +$code.=<<___; + mov $r1,%rax + dec %r15 # len-=16 + jnz .Loop + + mov 0(%rsp),$ctx +.cfi_restore $ctx + + mov $h0,0($ctx) # store hash value + mov $h1,8($ctx) + mov $h2,16($ctx) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data: +.Lblocks_epilogue: + RET +.cfi_endproc +___ +&end_function("poly1305_blocks_x86_64"); + +&declare_function("poly1305_emit_x86_64", 32, 3); +$code.=<<___; +.Lemit: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + RET +___ +&end_function("poly1305_emit_x86_64"); +if ($avx) { + +######################################################################## +# Layout of opaque area is following. 
+# +# unsigned __int32 h[5]; # current hash value base 2^26 +# unsigned __int32 is_base2_26; +# unsigned __int64 r[2]; # key value base 2^64 +# unsigned __int64 pad; +# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; +# +# where r^n are base 2^26 digits of degrees of multiplier key. There are +# 5 digits, but last four are interleaved with multiples of 5, totalling +# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. + +my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = + map("%xmm$_",(0..15)); + +$code.=<<___; +.type __poly1305_block,\@abi-omnipotent +.align 32 +__poly1305_block: + push $ctx +___ + &poly1305_iteration(); +$code.=<<___; + pop $ctx + RET +.size __poly1305_block,.-__poly1305_block + +.type __poly1305_init_avx,\@abi-omnipotent +.align 32 +__poly1305_init_avx: + push %rbp + mov %rsp,%rbp + mov $r0,$h0 + mov $r1,$h1 + xor $h2,$h2 + + lea 48+64($ctx),$ctx # size optimization + + mov $r1,%rax + call __poly1305_block # r^2 + + mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 + mov \$0x3ffffff,%edx + mov $h0,$d1 + and $h0#d,%eax + mov $r0,$d2 + and $r0#d,%edx + mov %eax,`16*0+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*0+4-64`($ctx) + shr \$26,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*1+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*1+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*2+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*2+4-64`($ctx) + shr \$26,$d2 + + mov $h1,%rax + mov $r1,%rdx + shl \$12,%rax + shl \$12,%rdx + or $d1,%rax + or $d2,%rdx + and \$0x3ffffff,%eax + and \$0x3ffffff,%edx + mov %eax,`16*3+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*3+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*4+0-64`($ctx) + mov $h1,$d1 + mov %edx,`16*4+4-64`($ctx) + mov $r1,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + shr \$14,$d1 + shr \$14,$d2 + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*5+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*5+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*6+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*6+4-64`($ctx) + shr \$26,$d2 + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+0-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d2#d,`16*7+4-64`($ctx) + lea ($d2,$d2,4),$d2 # *5 + mov $d1#d,`16*8+0-64`($ctx) + mov $d2#d,`16*8+4-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^3 + + mov \$0x3ffffff,%eax # save r^3 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+12-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+12-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and \$0x3ffffff,%eax + mov %eax,`16*3+12-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+12-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+12-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+12-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+12-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^4 + + mov \$0x3ffffff,%eax # save r^4 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+8-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+8-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and 
\$0x3ffffff,%eax + mov %eax,`16*3+8-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+8-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+8-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+8-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+8-64`($ctx) + + lea -48-64($ctx),$ctx # size [de-]optimization + pop %rbp + RET +.size __poly1305_init_avx,.-__poly1305_init_avx +___ + +&declare_function("poly1305_blocks_avx", 32, 4); +$code.=<<___; +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx: + and \$-16,$len + jz .Lno_data_avx + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx + + test \$31,$len + jz .Leven_avx + + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx_body: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + + call __poly1305_block + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + sub \$16,%r15 + jz .Lstore_base2_26_avx + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + jmp .Lproceed_avx + +.align 32 +.Lstore_base2_64_avx: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx + +.align 16 +.Lstore_base2_26_avx: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx: + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lno_data_avx: +.Lblocks_avx_epilogue: + RET +.cfi_endproc + +.align 32 +.Lbase2_64_avx: +.cfi_startproc + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + 
mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$31,$len + jz .Linit_avx + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + +.Linit_avx: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx: + mov %r15,$len + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lbase2_64_avx_epilogue: + jmp .Ldo_avx +.cfi_endproc + +.align 32 +.Leven_avx: +.cfi_startproc + vmovd 4*0($ctx),$H0 # load hash value + vmovd 4*1($ctx),$H1 + vmovd 4*2($ctx),$H2 + vmovd 4*3($ctx),$H3 + vmovd 4*4($ctx),$H4 + +.Ldo_avx: +___ +$code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + and \$-32,%rsp + sub \$-8,%rsp + lea -0x58(%rsp),%r11 + sub \$0x178,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x218,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx_body: +___ +$code.=<<___; + sub \$64,$len + lea -32($inp),%rax + cmovc %rax,$inp + + vmovdqu `16*3`($ctx),$D4 # preload r0^2 + lea `16*3+64`($ctx),$ctx # size optimization + lea .Lconst(%rip),%rcx + + ################################################################ + # load input + vmovdqu 16*2($inp),$T0 + vmovdqu 16*3($inp),$T1 + vmovdqa 64(%rcx),$MASK # .Lmask26 + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + vpsrlq \$40,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + jbe .Lskip_loop_avx + + # expand and copy pre-calculated table to stack + vmovdqu `16*1-64`($ctx),$D1 + vmovdqu `16*2-64`($ctx),$D2 + vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 + vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 + vmovdqa $D3,-0x90(%r11) + vmovdqa $D0,0x00(%rsp) + vpshufd \$0xEE,$D1,$D4 + vmovdqu `16*3-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x80(%r11) + vmovdqa $D1,0x10(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqu `16*4-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x70(%r11) + vmovdqa $D2,0x20(%rsp) + vpshufd \$0xEE,$D0,$D4 + vmovdqu `16*5-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa $D4,-0x60(%r11) + vmovdqa $D0,0x30(%rsp) + vpshufd \$0xEE,$D1,$D3 + vmovdqu `16*6-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D3,-0x50(%r11) + vmovdqa $D1,0x40(%rsp) + vpshufd \$0xEE,$D2,$D4 + vmovdqu `16*7-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D4,-0x40(%r11) + vmovdqa $D2,0x50(%rsp) + vpshufd \$0xEE,$D0,$D3 + vmovdqu `16*8-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa 
$D3,-0x30(%r11) + vmovdqa $D0,0x60(%rsp) + vpshufd \$0xEE,$D1,$D4 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x20(%r11) + vmovdqa $D1,0x70(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x10(%r11) + vmovdqa $D2,0x80(%rsp) + + jmp .Loop_avx + +.align 32 +.Loop_avx: + ################################################################ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + # \___________________/ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + # \___________________/ \____________________/ + # + # Note that we start with inp[2:3]*r^2. This is because it + # doesn't depend on reduction in previous iteration. + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # though note that $Tx and $Hx are "reversed" in this section, + # and $D4 is preloaded with r0^2... + + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vmovdqa $H2,0x20(%r11) # offload hash + vpmuludq $T2,$D4,$D2 # d3 = h2*r0 + vmovdqa 0x10(%rsp),$H2 # r1^2 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vmovdqa $H0,0x00(%r11) # + vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 + vmovdqa $H1,0x10(%r11) # + vpmuludq $T3,$H2,$H1 # h3*r1 + vpaddq $H0,$D0,$D0 # d0 += h4*s1 + vpaddq $H1,$D4,$D4 # d4 += h3*r1 + vmovdqa $H3,0x30(%r11) # + vpmuludq $T2,$H2,$H0 # h2*r1 + vpmuludq $T1,$H2,$H1 # h1*r1 + vpaddq $H0,$D3,$D3 # d3 += h2*r1 + vmovdqa 0x30(%rsp),$H3 # r2^2 + vpaddq $H1,$D2,$D2 # d2 += h1*r1 + vmovdqa $H4,0x40(%r11) # + vpmuludq $T0,$H2,$H2 # h0*r1 + vpmuludq $T2,$H3,$H0 # h2*r2 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + + vmovdqa 0x40(%rsp),$H4 # s2^2 + vpaddq $H0,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H3,$H1 # h1*r2 + vpmuludq $T0,$H3,$H3 # h0*r2 + vpaddq $H1,$D3,$D3 # d3 += h1*r2 + vmovdqa 0x50(%rsp),$H2 # r3^2 + vpaddq $H3,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H4,$H0 # h4*s2 + vpmuludq $T3,$H4,$H4 # h3*s2 + vpaddq $H0,$D1,$D1 # d1 += h4*s2 + vmovdqa 0x60(%rsp),$H3 # s3^2 + vpaddq $H4,$D0,$D0 # d0 += h3*s2 + + vmovdqa 0x80(%rsp),$H4 # s4^2 + vpmuludq $T1,$H2,$H1 # h1*r3 + vpmuludq $T0,$H2,$H2 # h0*r3 + vpaddq $H1,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $T4,$H3,$H0 # h4*s3 + vpmuludq $T3,$H3,$H1 # h3*s3 + vpaddq $H0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*0($inp),$H0 # load input + vpaddq $H1,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H3,$H3 # h2*s3 + vpmuludq $T2,$H4,$T2 # h2*s4 + vpaddq $H3,$D0,$D0 # d0 += h2*s3 + + vmovdqu 16*1($inp),$H1 # + vpaddq $T2,$D1,$D1 # d1 += h2*s4 + vpmuludq $T3,$H4,$T3 # h3*s4 + vpmuludq $T4,$H4,$T4 # h4*s4 + vpsrldq \$6,$H0,$H2 # splat input + vpaddq $T3,$D2,$D2 # d2 += h3*s4 + vpaddq $T4,$D3,$D3 # d3 += h4*s4 + vpsrldq \$6,$H1,$H3 # + vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 + vpmuludq $T1,$H4,$T0 # h1*s4 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpaddq $T4,$D4,$D4 # d4 += h0*r4 + vmovdqa -0x90(%r11),$T4 # r0^4 + vpaddq $T0,$D0,$D0 # d0 += h1*s4 + + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + #vpsrlq \$40,$H4,$H4 # 4 + vpsrldq \$`40/8`,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpand 0(%rcx),$H4,$H4 # .Lmask24 + 
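The d0..d4 formulas in the comment above are easier to check in scalar form. The sketch below (names invented, not part of the patch) multiplies an accumulator of five 26-bit limbs by r held as r0..r4 together with the precomputed 5*r1..5*r4 multiples, mirroring the interleaved r, 5*r table this file stores; the 5*r terms appear wherever a partial product would otherwise land at 2^130 or above, since 2^130 == 5 mod 2^130 - 5.

#include <stdint.h>

/* one scalar instance of the per-lane math: d = h * r, carries deferred */
static void poly1305_26_mul(uint64_t d[5], const uint32_t h[5],
			    const uint32_t r[5], const uint32_t s[5])
{
	/* s[i] = 5 * r[i]; only s[1..4] are ever used */
	d[0] = (uint64_t)h[0] * r[0] + (uint64_t)h[4] * s[1] +
	       (uint64_t)h[3] * s[2] + (uint64_t)h[2] * s[3] +
	       (uint64_t)h[1] * s[4];
	d[1] = (uint64_t)h[1] * r[0] + (uint64_t)h[0] * r[1] +
	       (uint64_t)h[4] * s[2] + (uint64_t)h[3] * s[3] +
	       (uint64_t)h[2] * s[4];
	d[2] = (uint64_t)h[2] * r[0] + (uint64_t)h[1] * r[1] +
	       (uint64_t)h[0] * r[2] + (uint64_t)h[4] * s[3] +
	       (uint64_t)h[3] * s[4];
	d[3] = (uint64_t)h[3] * r[0] + (uint64_t)h[2] * r[1] +
	       (uint64_t)h[1] * r[2] + (uint64_t)h[0] * r[3] +
	       (uint64_t)h[4] * s[4];
	d[4] = (uint64_t)h[4] * r[0] + (uint64_t)h[3] * r[1] +
	       (uint64_t)h[2] * r[2] + (uint64_t)h[1] * r[3] +
	       (uint64_t)h[0] * r[4];
}

Each partial product stays well below 2^58, so the sum of five of them fits comfortably in 64 bits, which is what lets the vector code defer all carries until the lazy-reduction step.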
vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpaddq 0x00(%r11),$H0,$H0 # add hash value + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + lea 16*2($inp),%rax + lea 16*4($inp),$inp + sub \$64,$len + cmovc %rax,$inp + + ################################################################ + # Now we accumulate (inp[0:1]+hash)*r^4 + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vmovdqa -0x80(%r11),$T2 # r1^4 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T0,$D2,$D2 + vpaddq $T1,$D3,$D3 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 + vpaddq $T4,$D4,$D4 + + vpaddq $T0,$D0,$D0 # d0 += h4*s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vmovdqa -0x60(%r11),$T3 # r2^4 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpmuludq $H1,$T2,$T1 # h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T1,$D2,$D2 # d2 += h1*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + + vmovdqa -0x50(%r11),$T4 # s2^4 + vpmuludq $H2,$T3,$T0 # h2*r2 + vpmuludq $H1,$T3,$T1 # h1*r2 + vpaddq $T0,$D4,$D4 # d4 += h2*r2 + vpaddq $T1,$D3,$D3 # d3 += h1*r2 + vmovdqa -0x40(%r11),$T2 # r3^4 + vpmuludq $H0,$T3,$T3 # h0*r2 + vpmuludq $H4,$T4,$T0 # h4*s2 + vpaddq $T3,$D2,$D2 # d2 += h0*r2 + vpaddq $T0,$D1,$D1 # d1 += h4*s2 + vmovdqa -0x30(%r11),$T3 # s3^4 + vpmuludq $H3,$T4,$T4 # h3*s2 + vpmuludq $H1,$T2,$T1 # h1*r3 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + + vmovdqa -0x10(%r11),$T4 # s4^4 + vpaddq $T1,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T2,$T2 # h0*r3 + vpmuludq $H4,$T3,$T0 # h4*s3 + vpaddq $T2,$D3,$D3 # d3 += h0*r3 + vpaddq $T0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*2($inp),$T0 # load input + vpmuludq $H3,$T3,$T2 # h3*s3 + vpmuludq $H2,$T3,$T3 # h2*s3 + vpaddq $T2,$D1,$D1 # d1 += h3*s3 + vmovdqu 16*3($inp),$T1 # + vpaddq $T3,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H2,$T4,$H2 # h2*s4 + vpmuludq $H3,$T4,$H3 # h3*s4 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $H2,$D1,$D1 # d1 += h2*s4 + vpmuludq $H4,$T4,$H4 # h4*s4 + vpsrldq \$6,$T1,$T3 # + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 + vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 + vpmuludq $H1,$T4,$H0 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + #vpsrlq \$40,$T4,$T4 # 4 + vpsrldq \$`40/8`,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpand 0(%rcx),$T4,$T4 # .Lmask24 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + ################################################################ + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + # and P. 
Schwabe + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D0 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D0,$H0,$H0 + vpsllq \$2,$D0,$D0 + vpaddq $D0,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + ja .Loop_avx + +.Lskip_loop_avx: + ################################################################ + # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 + add \$32,$len + jnz .Long_tail_avx + + vpaddq $H2,$T2,$T2 + vpaddq $H0,$T0,$T0 + vpaddq $H1,$T1,$T1 + vpaddq $H3,$T3,$T3 + vpaddq $H4,$T4,$T4 + +.Long_tail_avx: + vmovdqa $H2,0x20(%r11) + vmovdqa $H0,0x00(%r11) + vmovdqa $H1,0x10(%r11) + vmovdqa $H3,0x30(%r11) + vmovdqa $H4,0x40(%r11) + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $T2,$D4,$D2 # d2 = h2*r0 + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vpmuludq $T3,$H2,$H0 # h3*r1 + vpaddq $H0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n + vpmuludq $T2,$H2,$H1 # h2*r1 + vpaddq $H1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n + vpmuludq $T1,$H2,$H0 # h1*r1 + vpaddq $H0,$D2,$D2 # d2 += h1*r1 + vpmuludq $T0,$H2,$H2 # h0*r1 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + vpmuludq $T4,$H3,$H3 # h4*s1 + vpaddq $H3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n + vpmuludq $T2,$H4,$H1 # h2*r2 + vpaddq $H1,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H4,$H0 # h1*r2 + vpaddq $H0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n + vpmuludq $T0,$H4,$H4 # h0*r2 + vpaddq $H4,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H2,$H1 # h4*s2 + vpaddq $H1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n + vpmuludq $T3,$H2,$H2 # h3*s2 + vpaddq $H2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $T1,$H3,$H0 # h1*r3 + vpaddq $H0,$D4,$D4 # d4 += h1*r3 + vpmuludq $T0,$H3,$H3 # h0*r3 + vpaddq $H3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n + vpmuludq $T4,$H4,$H1 # h4*s3 + vpaddq $H1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n + vpmuludq $T3,$H4,$H0 # h3*s3 + vpaddq $H0,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H4,$H4 # h2*s3 + vpaddq $H4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $T0,$H2,$H2 # h0*r4 + vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 + vpmuludq $T4,$H3,$H1 # h4*s4 + vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 + vpmuludq $T3,$H3,$H0 # h3*s4 + vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 + vpmuludq $T2,$H3,$H1 # h2*s4 + vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 + vpmuludq $T1,$H3,$H3 # h1*s4 + vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 + + jz .Lshort_tail_avx + + vmovdqu 16*0($inp),$H0 # load input + vmovdqu 16*1($inp),$H1 + + vpsrldq \$6,$H0,$H2 # splat input + vpsrldq \$6,$H1,$H3 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + vpsrlq \$40,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + 
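The lazy-reduction steps above ("h3 -> h4", ..., "h4 -> h0") amount to the following carry chain, written here as one sequential scalar pass (the vector code interleaves two chains to hide latency); the carry leaving the top limb wraps around multiplied by 5, which the assembly implements as an add, a shift-left-by-2, and another add. Illustrative only, names invented:

#include <stdint.h>

#define MASK26 ((1ULL << 26) - 1)

/* propagate 26-bit carries; the 2^130 wrap folds back as carry * 5 */
static void poly1305_26_reduce(uint32_t h[5], const uint64_t d[5])
{
	uint64_t c, t[5];

	t[3] = d[3];		c = t[3] >> 26; t[3] &= MASK26;
	t[4] = d[4] + c;	c = t[4] >> 26; t[4] &= MASK26;	/* h3 -> h4 */
	t[0] = d[0] + c * 5;	c = t[0] >> 26; t[0] &= MASK26;	/* h4 -> h0 */
	t[1] = d[1] + c;	c = t[1] >> 26; t[1] &= MASK26;	/* h0 -> h1 */
	t[2] = d[2] + c;	c = t[2] >> 26; t[2] &= MASK26;	/* h1 -> h2 */
	t[3] += c;		c = t[3] >> 26; t[3] &= MASK26;	/* h2 -> h3 */
	t[4] += c;						/* h3 -> h4 */

	for (int i = 0; i < 5; i++)
		h[i] = (uint32_t)t[i];
}

After this pass every limb is back under 27 bits, loose enough that the next block's products again fit in 64-bit lanes without intermediate carries.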
vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 + vpaddq 0x00(%r11),$H0,$H0 + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + ################################################################ + # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpaddq $T0,$D0,$D0 # d0 += h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T1,$D1,$D1 # d1 += h1*r0 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpaddq $T0,$D2,$D2 # d2 += h2*r0 + vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T1,$D3,$D3 # d3 += h3*r0 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpaddq $T4,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 + vpmuludq $H1,$T2,$T0 # h1*r1 + vpaddq $T0,$D2,$D2 # d2 += h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + vpmuludq $H4,$T3,$T3 # h4*s1 + vpaddq $T3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 + vpmuludq $H2,$T4,$T1 # h2*r2 + vpaddq $T1,$D4,$D4 # d4 += h2*r2 + vpmuludq $H1,$T4,$T0 # h1*r2 + vpaddq $T0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 + vpmuludq $H0,$T4,$T4 # h0*r2 + vpaddq $T4,$D2,$D2 # d2 += h0*r2 + vpmuludq $H4,$T2,$T1 # h4*s2 + vpaddq $T1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 + vpmuludq $H3,$T2,$T2 # h3*s2 + vpaddq $T2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $H1,$T3,$T0 # h1*r3 + vpaddq $T0,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T3,$T3 # h0*r3 + vpaddq $T3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 + vpmuludq $H4,$T4,$T1 # h4*s3 + vpaddq $T1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 + vpmuludq $H3,$T4,$T0 # h3*s3 + vpaddq $T0,$D1,$D1 # d1 += h3*s3 + vpmuludq $H2,$T4,$T4 # h2*s3 + vpaddq $T4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H0,$T2,$T2 # h0*r4 + vpaddq $T2,$D4,$D4 # d4 += h0*r4 + vpmuludq $H4,$T3,$T1 # h4*s4 + vpaddq $T1,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$T3,$T0 # h3*s4 + vpaddq $T0,$D2,$D2 # d2 += h3*s4 + vpmuludq $H2,$T3,$T1 # h2*s4 + vpaddq $T1,$D1,$D1 # d1 += h2*s4 + vpmuludq $H1,$T3,$T3 # h1*s4 + vpaddq $T3,$D0,$D0 # d0 += h1*s4 + +.Lshort_tail_avx: + ################################################################ + # horizontal addition + + vpsrldq \$8,$D4,$T4 + vpsrldq \$8,$D3,$T3 + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$D0,$T0 + vpsrldq \$8,$D2,$T2 + vpaddq $T3,$D3,$D3 + vpaddq $T4,$D4,$D4 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$D2,$D2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D4,$H4 + vpand $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$H1 + vpand $MASK,$D1,$D1 + vpaddq $H1,$D2,$D2 # h1 -> h2 + + vpaddq $H4,$D0,$D0 + vpsllq \$2,$H4,$H4 + vpaddq $H4,$D0,$D0 # h4 -> h0 + + vpsrlq \$26,$D2,$H2 + vpand $MASK,$D2,$D2 + vpaddq $H2,$D3,$D3 # h2 -> h3 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vmovd $D0,`4*0-48-64`($ctx) # save partially reduced + 
vmovd $D1,`4*1-48-64`($ctx) + vmovd $D2,`4*2-48-64`($ctx) + vmovd $D3,`4*3-48-64`($ctx) + vmovd $D4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa 0x50(%r11),%xmm6 + vmovdqa 0x60(%r11),%xmm7 + vmovdqa 0x70(%r11),%xmm8 + vmovdqa 0x80(%r11),%xmm9 + vmovdqa 0x90(%r11),%xmm10 + vmovdqa 0xa0(%r11),%xmm11 + vmovdqa 0xb0(%r11),%xmm12 + vmovdqa 0xc0(%r11),%xmm13 + vmovdqa 0xd0(%r11),%xmm14 + vmovdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx_epilogue: +___ +$code.=<<___ if (!$win64); + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp +___ +$code.=<<___; + vzeroupper + RET +.cfi_endproc +___ +&end_function("poly1305_blocks_avx"); + +&declare_function("poly1305_emit_avx", 32, 3); +$code.=<<___; + cmpl \$0,20($ctx) # is_base2_26? + je .Lemit + + mov 0($ctx),%eax # load hash value base 2^26 + mov 4($ctx),%ecx + mov 8($ctx),%r8d + mov 12($ctx),%r11d + mov 16($ctx),%r10d + + shl \$26,%rcx # base 2^26 -> base 2^64 + mov %r8,%r9 + shl \$52,%r8 + add %rcx,%rax + shr \$12,%r9 + add %rax,%r8 # h0 + adc \$0,%r9 + + shl \$14,%r11 + mov %r10,%rax + shr \$24,%r10 + add %r11,%r9 + shl \$40,%rax + add %rax,%r9 # h1 + adc \$0,%r10 # h2 + + mov %r10,%rax # could be partially reduced, so reduce + mov %r10,%rcx + and \$3,%r10 + shr \$2,%rax + and \$-4,%rcx + add %rcx,%rax + add %rax,%r8 + adc \$0,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + RET +___ +&end_function("poly1305_emit_avx"); + +if ($avx>1) { + +my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = + map("%ymm$_",(0..15)); +my $S4=$MASK; + +sub poly1305_blocks_avxN { + my ($avx512) = @_; + my $suffix = $avx512 ? "_avx512" : ""; +$code.=<<___; +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx2$suffix + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2$suffix: + and \$-16,$len + jz .Lno_data_avx2$suffix + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx2$suffix + + test \$63,$len + jz .Leven_avx2$suffix + + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx2_body$suffix: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... 
so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + +.Lbase2_26_pre_avx2$suffix: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_26_pre_avx2$suffix + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + test %r15,%r15 + jz .Lstore_base2_26_avx2$suffix + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + jmp .Lproceed_avx2$suffix + +.align 32 +.Lstore_base2_64_avx2$suffix: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx2$suffix + +.align 16 +.Lstore_base2_26_avx2$suffix: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx2$suffix: + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lno_data_avx2$suffix: +.Lblocks_avx2_epilogue$suffix: + RET +.cfi_endproc + +.align 32 +.Lbase2_64_avx2$suffix: +.cfi_startproc + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx2_body$suffix: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$63,$len + jz .Linit_avx2$suffix + +.Lbase2_64_pre_avx2$suffix: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_64_pre_avx2$suffix + +.Linit_avx2$suffix: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx2$suffix: + mov %r15,$len # restore $len +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d + mov \$`(1<<31|1<<30|1<<16)`,%r11d +___ +$code.=<<___; + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lbase2_64_avx2_epilogue$suffix: + jmp .Ldo_avx2$suffix +.cfi_endproc + +.align 32 +.Leven_avx2$suffix: 
+.cfi_startproc +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d +___ +$code.=<<___; + vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 + vmovd 4*1($ctx),%x#$H1 + vmovd 4*2($ctx),%x#$H2 + vmovd 4*3($ctx),%x#$H3 + vmovd 4*4($ctx),%x#$H4 + +.Ldo_avx2$suffix: +___ +$code.=<<___ if (!$kernel && $avx>2); + cmp \$512,$len + jb .Lskip_avx512 + and %r11d,%r9d + test \$`1<<16`,%r9d # check for AVX512F + jnz .Lblocks_avx512 +.Lskip_avx512$suffix: +___ +$code.=<<___ if ($avx > 2 && $avx512 && $kernel); + cmp \$512,$len + jae .Lblocks_avx512 +___ +$code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea 8(%rsp),%r10 + sub \$0x1c8,%rsp + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) +.Ldo_avx2_body$suffix: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 + + # expand and copy pre-calculated table to stack + vmovdqu `16*0-64`($ctx),%x#$T2 + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$T3 + vmovdqu `16*2-64`($ctx),%x#$T4 + vmovdqu `16*3-64`($ctx),%x#$D0 + vmovdqu `16*4-64`($ctx),%x#$D1 + vmovdqu `16*5-64`($ctx),%x#$D2 + lea 0x90(%rsp),%rax # size optimization + vmovdqu `16*6-64`($ctx),%x#$D3 + vpermd $T2,$T0,$T2 # 00003412 -> 14243444 + vmovdqu `16*7-64`($ctx),%x#$D4 + vpermd $T3,$T0,$T3 + vmovdqu `16*8-64`($ctx),%x#$MASK + vpermd $T4,$T0,$T4 + vmovdqa $T2,0x00(%rsp) + vpermd $D0,$T0,$D0 + vmovdqa $T3,0x20-0x90(%rax) + vpermd $D1,$T0,$D1 + vmovdqa $T4,0x40-0x90(%rax) + vpermd $D2,$T0,$D2 + vmovdqa $D0,0x60-0x90(%rax) + vpermd $D3,$T0,$D3 + vmovdqa $D1,0x80-0x90(%rax) + vpermd $D4,$T0,$D4 + vmovdqa $D2,0xa0-0x90(%rax) + vpermd $MASK,$T0,$MASK + vmovdqa $D3,0xc0-0x90(%rax) + vmovdqa $D4,0xe0-0x90(%rax) + vmovdqa $MASK,0x100-0x90(%rax) + vmovdqa 64(%rcx),$MASK # .Lmask26 + + ################################################################ + # load input + vmovdqu 16*0($inp),%x#$T0 + vmovdqu 16*1($inp),%x#$T1 + vinserti128 \$1,16*2($inp),$T0,$T0 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T3,$T2,$T2 # 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$64,$len + jz .Ltail_avx2$suffix + jmp .Loop_avx2$suffix + +.align 32 +.Loop_avx2$suffix: + ################################################################ + # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 + # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 + # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 + # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 + # \________/\__________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqa `32*0`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqa `32*1`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqa `32*3`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 + vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # 
d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 + # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + vmovdqa `32*4-0x90`(%rax),$T1 # s2 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vmovdqu 16*0($inp),%x#$T0 # load input + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + vinserti128 \$1,16*2($inp),$T0,$T0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vmovdqu 16*1($inp),%x#$T1 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqa `32*5-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpsrldq \$6,$T1,$T3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + vpunpckhqdq $T1,$T0,$T4 # 4 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # lazy reduction (interleaved with tail of input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$4,$T3,$T2 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpand $MASK,$T2,$T2 # 2 + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$30,$T3,$T3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 
32(%rcx),$T4,$T4 # padbit, yes, always + + sub \$64,$len + jnz .Loop_avx2$suffix + + .byte 0x66,0x90 +.Ltail_avx2$suffix: + ################################################################ + # while above multiplications were by r^4 in all lanes, in last + # iteration we multiply least significant lane by r^4 and most + # significant one by r, so copy of above except that references + # to the precomputed table are displaced by 4... + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqu `32*0+4`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqu `32*1+4`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqu `32*3+4`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 + vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1 + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # horizontal addition + + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$H2,$T2 + vpsrldq \$8,$H3,$T3 + vpsrldq \$8,$H4,$T4 + vpsrldq \$8,$H0,$T0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + + vpermq \$0x2,$H3,$T3 + vpermq \$0x2,$H4,$T4 + vpermq \$0x2,$H0,$T0 + vpermq \$0x2,$D1,$T1 + vpermq \$0x2,$H2,$T2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 
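The tail handling above relies on the algebraic split already described in the .Loop_avx2 comment: every lane is advanced by r^4, and in the final step lane j is multiplied by r^(4-j) instead, which is why the tail reads a shifted copy of the power table. The toy program below (not part of the patch, and using a small prime in place of 2^130 - 5 so plain uint64_t arithmetic suffices) checks that this 4-lane evaluation matches straightforward Horner evaluation:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define P 1000003ULL	/* toy modulus standing in for 2^130 - 5 */

/* plain Horner: h = (...((m0)*r + m1)*r + ... + m[n-1])*r mod P */
static uint64_t horner(const uint64_t *m, int n, uint64_t r)
{
	uint64_t h = 0;

	for (int i = 0; i < n; i++)
		h = (h + m[i]) * r % P;
	return h;
}

/* four lanes advance by r^4; lane j is finally scaled by r^(4-j) */
static uint64_t lanes4(const uint64_t *m, int n, uint64_t r)
{
	uint64_t r2 = r * r % P, r3 = r2 * r % P, r4 = r2 * r2 % P;
	uint64_t pw[4] = { r4, r3, r2, r };
	uint64_t h = 0;

	assert(n % 4 == 0);
	for (int j = 0; j < 4; j++) {
		uint64_t lane = 0;

		for (int i = j; i < n; i += 4)	/* lane j sees every 4th block */
			lane = (lane * r4 + m[i]) % P;
		h = (h + lane * pw[j]) % P;
	}
	return h;
}

int main(void)
{
	uint64_t m[12], r = 123457;

	for (int i = 0; i < 12; i++)
		m[i] = (i + 1) * 99991ULL % P;

	printf("horner: %llu\n", (unsigned long long)horner(m, 12, r));
	printf("lanes4: %llu\n", (unsigned long long)lanes4(m, 12, r));
	return 0;
}

Both functions print the same value, since each message block m[i] ends up multiplied by r^(n-i) either way; only the grouping of the work differs.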
+ + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa -0xb0(%r10),%xmm6 + vmovdqa -0xa0(%r10),%xmm7 + vmovdqa -0x90(%r10),%xmm8 + vmovdqa -0x80(%r10),%xmm9 + vmovdqa -0x70(%r10),%xmm10 + vmovdqa -0x60(%r10),%xmm11 + vmovdqa -0x50(%r10),%xmm12 + vmovdqa -0x40(%r10),%xmm13 + vmovdqa -0x30(%r10),%xmm14 + vmovdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp +.Ldo_avx2_epilogue$suffix: +___ +$code.=<<___ if (!$win64); + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp +___ +$code.=<<___; + vzeroupper + RET +.cfi_endproc +___ +if($avx > 2 && $avx512) { +my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); +my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); +my $PADBIT="%zmm30"; + +map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain +map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); +map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); +map(s/%y/%z/,($MASK)); + +$code.=<<___; +.cfi_startproc +.Lblocks_avx512: + mov \$15,%eax + kmovw %eax,%k2 +___ +$code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea 8(%rsp),%r10 + sub \$0x1c8,%rsp + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) +.Ldo_avx512_body: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 + + # expand pre-calculated table + vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} + mov \$0x20,%rax + vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} + vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} + vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} + vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} + vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} + vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} + vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} + vpermd $D0,$T2,$R0 # 00003412 -> 14243444 + vpbroadcastq 64(%rcx),$MASK # .Lmask26 + vpermd $D1,$T2,$R1 + vpermd $T0,$T2,$S1 + vpermd $D2,$T2,$R2 + vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 + vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 + vpermd $T1,$T2,$S2 + vmovdqu64 $R1,0x00(%rsp,%rax){%k2} + vpsrlq \$32,$R1,$T1 + vpermd $D3,$T2,$R3 + vmovdqa64 $S1,0x40(%rsp){%k2} + vpermd $T3,$T2,$S3 + vpermd $D4,$T2,$R4 + vmovdqu64 $R2,0x40(%rsp,%rax){%k2} + vpermd $T4,$T2,$S4 + vmovdqa64 $S2,0x80(%rsp){%k2} + vmovdqu64 $R3,0x80(%rsp,%rax){%k2} + vmovdqa64 $S3,0xc0(%rsp){%k2} + vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} + vmovdqa64 $S4,0x100(%rsp){%k2} + + ################################################################ + # calculate 5th through 8th powers of the key + # + # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 + # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 + # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 + # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 + # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 + + vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 + vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 + vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 + vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 + vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 + vpsrlq \$32,$R2,$T2 + + vpmuludq $T1,$S4,$M0 + vpmuludq $T1,$R0,$M1 + vpmuludq $T1,$R1,$M2 + vpmuludq $T1,$R2,$M3 + vpmuludq $T1,$R3,$M4 + vpsrlq \$32,$R3,$T3 + vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 + vpaddq $M1,$D1,$D1 # d1 += r1'*r0 + vpaddq $M2,$D2,$D2 # d2 += r1'*r1 + vpaddq $M3,$D3,$D3 # d3 += r1'*r2 + vpaddq $M4,$D4,$D4 # d4 += r1'*r3 + + vpmuludq $T2,$S3,$M0 + vpmuludq $T2,$S4,$M1 + vpmuludq $T2,$R1,$M3 + vpmuludq $T2,$R2,$M4 + vpmuludq $T2,$R0,$M2 + vpsrlq \$32,$R4,$T4 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 + vpaddq $M3,$D3,$D3 # d3 += r2'*r1 + vpaddq $M4,$D4,$D4 # d4 += r2'*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*r0 + + vpmuludq $T3,$S2,$M0 + vpmuludq $T3,$R0,$M3 + vpmuludq $T3,$R1,$M4 + vpmuludq $T3,$S3,$M1 + vpmuludq $T3,$S4,$M2 + vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 + vpaddq $M3,$D3,$D3 # d3 += r3'*r0 + vpaddq $M4,$D4,$D4 # d4 += r3'*r1 + vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 + vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 + + vpmuludq $T4,$S4,$M3 + vpmuludq $T4,$R0,$M4 + vpmuludq $T4,$S1,$M0 + vpmuludq $T4,$S2,$M1 + vpmuludq $T4,$S3,$M2 + vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 + vpaddq $M4,$D4,$D4 # d4 += r2'*r0 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 + + ################################################################ + # load input + vmovdqu64 16*0($inp),%z#$T3 + vmovdqu64 16*4($inp),%z#$T4 + lea 16*8($inp),$inp + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D4,$M4 + vpandq $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$M1 + vpandq $MASK,$D1,$D1 + vpaddq $M1,$D2,$D2 # d1 -> d2 + + vpaddq $M4,$D0,$D0 + vpsllq \$2,$M4,$M4 + vpaddq $M4,$D0,$D0 # d4 -> d0 + + vpsrlq \$26,$D2,$M2 + vpandq $MASK,$D2,$D2 + vpaddq $M2,$D3,$D3 # d2 -> d3 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + ################################################################ + # at this point we have 14243444 in $R0-$S4 and 05060708 in + # $D0-$D4, ... 
+ + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + # ... since input 64-bit lanes are ordered as 73625140, we could + # "vperm" it to 76543210 (here and in each loop iteration), *or* + # we could just flow along, hence the goal for $R0-$S4 is + # 1858286838784888 ... + + vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: + mov \$0x7777,%eax + kmovw %eax,%k1 + + vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- + vpermd $R1,$M0,$R1 + vpermd $R2,$M0,$R2 + vpermd $R3,$M0,$R3 + vpermd $R4,$M0,$R4 + + vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 + vpermd $D1,$M0,${R1}{%k1} + vpermd $D2,$M0,${R2}{%k1} + vpermd $D3,$M0,${R3}{%k1} + vpermd $D4,$M0,${R4}{%k1} + + vpslld \$2,$R1,$S1 # *5 + vpslld \$2,$R2,$S2 + vpslld \$2,$R3,$S3 + vpslld \$2,$R4,$S4 + vpaddd $R1,$S1,$S1 + vpaddd $R2,$S2,$S2 + vpaddd $R3,$S3,$S3 + vpaddd $R4,$S4,$S4 + + vpbroadcastq 32(%rcx),$PADBIT # .L129 + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + vporq $T3,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$14,$T4,$T3 + vpsrlq \$40,$T4,$T4 # 4 + vpandq $MASK,$T2,$T2 # 2 + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$192,$len + jbe .Ltail_avx512 + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + ################################################################ + # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 + # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 + # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 + # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 + # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 + # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 + # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 + # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 + # \________/\___________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 + # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 + # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 + # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 + # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpaddq $H0,$T0,$H0 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu64 16*0($inp),$T3 # load input + vmovdqu64 16*4($inp),$T4 + lea 16*8($inp),$inp + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + vpmuludq $H3,$R0,$M3 + 
vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpaddq $M3,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$S4,$M2 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + + vpsrlq \$26,$D3,$H3 + vpandq $MASK,$D3,$D3 + vpaddq $H3,$D4,$H4 # h3 -> h4 + + vporq $T3,$T2,$T2 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpandq $MASK,$T2,$T2 # 2 + + vpsrlq \$26,$H4,$D4 + vpandq $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpandq $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpandq $MASK,$H2,$H2 + vpaddq $D2,$D3,$H3 # h2 -> h3 + + vpsrlq \$14,$T4,$T3 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpandq $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + sub \$128,$len + ja .Loop_avx512 + +.Ltail_avx512: + ################################################################ + # while above multiplications were by r^8 in all lanes, in last + # iteration we multiply least significant lane by r^8 and most + # significant one by r, that's why table gets shifted... 
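+
+	# (Illustration, not part of the generated code: with the last
+	#  partial-block handling set aside, the loop/tail bookkeeping is,
+	#  in scalar terms,
+	#
+	#	for (i = 0; i < nblocks; i += 8)	/* .Loop_avx512  */
+	#		for (j = 0; j < 8; j++)
+	#			lane[j] = (lane[j]*r^8 + m[i+j]) mod p
+	#	for (h = 0, j = 0; j < 8; j++)		/* .Ltail_avx512 */
+	#		h = (h + lane[j]*r^(8-j)) mod p
+	#
+	#  so lane 0 still needs r^8 while lane 7 only needs r^1, hence the
+	#  shifted view of the key-power table set up below.)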
+ + vpsrlq \$32,$R0,$R0 # 0105020603070408 + vpsrlq \$32,$R1,$R1 + vpsrlq \$32,$R2,$R2 + vpsrlq \$32,$S3,$S3 + vpsrlq \$32,$S4,$S4 + vpsrlq \$32,$R3,$R3 + vpsrlq \$32,$R4,$R4 + vpsrlq \$32,$S1,$S1 + vpsrlq \$32,$S2,$S2 + + ################################################################ + # load either next or last 64 byte of input + lea ($inp,$len),$inp + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu 16*0($inp),%x#$T0 + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vmovdqu 16*1($inp),%x#$T1 + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 + vpmuludq $H3,$R0,$M3 + vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpmuludq $H3,$S4,$M2 + vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # horizontal addition + + mov \$1,%eax + vpermq \$0xb1,$H3,$D3 + vpermq \$0xb1,$D4,$H4 + vpermq \$0xb1,$H0,$D0 + vpermq \$0xb1,$H1,$D1 + vpermq \$0xb1,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + kmovw %eax,%k3 + vpermq \$0x2,$H3,$D3 + vpermq \$0x2,$H4,$D4 + vpermq \$0x2,$H0,$D0 + vpermq \$0x2,$H1,$D1 + vpermq \$0x2,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + vextracti64x4 \$0x1,$H3,%y#$D3 + vextracti64x4 \$0x1,$H4,%y#$D4 + vextracti64x4 \$0x1,$H0,%y#$D0 + vextracti64x4 \$0x1,$H1,%y#$D1 + vextracti64x4 \$0x1,$H2,%y#$D2 + vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case + vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 + vpaddq $D0,$H0,${H0}{%k3}{z} + vpaddq $D1,$H1,${H1}{%k3}{z} + vpaddq $D2,$H2,${H2}{%k3}{z} +___ +map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); +map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); +$code.=<<___; + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpunpcklqdq $T3,$T2,$T2 
# 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 + vpand $MASK,$T1,$T1 # 1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + vpaddq $D3,$H4,$H4 # h3 -> h4 + + lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 + add \$64,$len + jnz .Ltail_avx2$suffix + + vpsubq $T2,$H2,$H2 # undo input accumulation + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) + vzeroall +___ +$code.=<<___ if ($win64); + movdqa -0xb0(%r10),%xmm6 + movdqa -0xa0(%r10),%xmm7 + movdqa -0x90(%r10),%xmm8 + movdqa -0x80(%r10),%xmm9 + movdqa -0x70(%r10),%xmm10 + movdqa -0x60(%r10),%xmm11 + movdqa -0x50(%r10),%xmm12 + movdqa -0x40(%r10),%xmm13 + movdqa -0x30(%r10),%xmm14 + movdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp +.Ldo_avx512_epilogue: +___ +$code.=<<___ if (!$win64); + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp +___ +$code.=<<___; + RET +.cfi_endproc +___ + +} + +} + +&declare_function("poly1305_blocks_avx2", 32, 4); +poly1305_blocks_avxN(0); +&end_function("poly1305_blocks_avx2"); + +####################################################################### +if ($avx>2) { +# On entry we have input length divisible by 64. But since inner loop +# processes 128 bytes per iteration, cases when length is not divisible +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this +# reason stack layout is kept identical to poly1305_blocks_avx2. If not +# for this tail, we wouldn't have to even allocate stack frame... + +&declare_function("poly1305_blocks_avx512", 32, 4); +poly1305_blocks_avxN(1); +&end_function("poly1305_blocks_avx512"); + +if (!$kernel && $avx>3) { +######################################################################## +# VPMADD52 version using 2^44 radix. +# +# One can argue that base 2^52 would be more natural. Well, even though +# some operations would be more natural, one has to recognize couple of +# things. Base 2^52 doesn't provide advantage over base 2^44 if you look +# at amount of multiply-n-accumulate operations. Secondly, it makes it +# impossible to pre-compute multiples of 5 [referred to as s[]/sN in +# reference implementations], which means that more such operations +# would have to be performed in inner loop, which in turn makes critical +# path longer. In other words, even though base 2^44 reduction might +# look less elegant, overall critical path is actually shorter... + +######################################################################## +# Layout of opaque area is following. 
+# +# unsigned __int64 h[3]; # current hash value base 2^44 +# unsigned __int64 s[2]; # key value*20 base 2^44 +# unsigned __int64 r[3]; # key value base 2^44 +# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; +# # r^n positions reflect +# # placement in register, not +# # memory, R[3] is R[1]*20 + +$code.=<<___; +.type poly1305_init_base2_44,\@function,3 +.align 32 +poly1305_init_base2_44: + xor %eax,%eax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + +.Linit_base2_44: + lea poly1305_blocks_vpmadd52(%rip),%r10 + lea poly1305_emit_base2_44(%rip),%r11 + + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + mov \$0x00000fffffffffff,%r8 + and 8($inp),%rcx + mov \$0x00000fffffffffff,%r9 + and %rax,%r8 + shrd \$44,%rcx,%rax + mov %r8,40($ctx) # r0 + and %r9,%rax + shr \$24,%rcx + mov %rax,48($ctx) # r1 + lea (%rax,%rax,4),%rax # *5 + mov %rcx,56($ctx) # r2 + shl \$2,%rax # magic <<2 + lea (%rcx,%rcx,4),%rcx # *5 + shl \$2,%rcx # magic <<2 + mov %rax,24($ctx) # s1 + mov %rcx,32($ctx) # s2 + movq \$-1,64($ctx) # write impossible value +___ +$code.=<<___ if ($flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if ($flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax + RET +.size poly1305_init_base2_44,.-poly1305_init_base2_44 +___ +{ +my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); +my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); +my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52,\@function,4 +.align 32 +poly1305_blocks_vpmadd52: + shr \$4,$len + jz .Lno_data_vpmadd52 # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + # if powers of the key are not calculated yet, process up to 3 + # blocks with this single-block subroutine, otherwise ensure that + # length is divisible by 2 blocks and pass the rest down to next + # subroutine... + + mov \$3,%rax + mov \$1,%r10 + cmp \$4,$len # is input long + cmovae %r10,%rax + test %r8,%r8 # is power value impossible? + cmovns %r10,%rax + + and $len,%rax # is input of favourable length? 
+ jz .Lblocks_vpmadd52_4x + + sub %rax,$len + mov \$7,%r10d + mov \$1,%r11d + kmovw %r10d,%k7 + lea .L2_44_inp_permd(%rip),%r10 + kmovw %r11d,%k1 + + vmovq $padbit,%x#$PAD + vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd + vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift + vpermq \$0xcf,$PAD,$PAD + vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask + + vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value + vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys + vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} + vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} + + vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt + vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft + + jmp .Loop_vpmadd52 + +.align 32 +.Loop_vpmadd52: + vmovdqu32 0($inp),%x#$T0 # load input as ----3210 + lea 16($inp),$inp + + vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 + vpsrlvq $inp_shift,$T0,$T0 + vpandq $reduc_mask,$T0,$T0 + vporq $PAD,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo # accumulate input + + vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value + vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} + vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} + + vpxord $Dlo,$Dlo,$Dlo + vpxord $Dhi,$Dhi,$Dhi + + vpmadd52luq $r2r1r0,$H0,$Dlo + vpmadd52huq $r2r1r0,$H0,$Dhi + + vpmadd52luq $r1r0s2,$H1,$Dlo + vpmadd52huq $r1r0s2,$H1,$Dhi + + vpmadd52luq $r0s2s1,$H2,$Dlo + vpmadd52huq $r0s2s1,$H2,$Dhi + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword + vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword + vpandq $reduc_mask,$Dlo,$Dlo + + vpaddq $T0,$Dhi,$Dhi + + vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword + + vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word + vpandq $reduc_mask,$Dlo,$Dlo + + vpermq \$0b10010011,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} + + vpaddq $T0,$Dlo,$Dlo + vpsllq \$2,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + dec %rax # len-=16 + jnz .Loop_vpmadd52 + + vmovdqu64 $Dlo,0($ctx){%k7} # store hash value + + test $len,$len + jnz .Lblocks_vpmadd52_4x + +.Lno_data_vpmadd52: + RET +.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 +___ +} +{ +######################################################################## +# As implied by its name 4x subroutine processes 4 blocks in parallel +# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power +# and is handled in 256-bit %ymm registers. + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_4x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_4x: + shr \$4,$len + jz .Lno_data_vpmadd52_4x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + +.Lblocks_vpmadd52_4x: + vpbroadcastq $padbit,$PAD + + vmovdqa64 .Lx_mask44(%rip),$mask44 + mov \$5,%eax + vmovdqa64 .Lx_mask42(%rip),$mask42 + kmovw %eax,%k1 # used in 2x path + + test %r8,%r8 # is power value impossible? + js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Lblocks_vpmadd52_2x_do + +.Lblocks_vpmadd52_4x_do: + vpbroadcastq 64($ctx),$R0 # load 4th power of the key + vpbroadcastq 96($ctx),$R1 + vpbroadcastq 128($ctx),$R2 + vpbroadcastq 160($ctx),$S1 + +.Lblocks_vpmadd52_4x_key_loaded: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + test \$7,$len # is len 8*n? 
+ jz .Lblocks_vpmadd52_8x + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 3-1-2-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$4,$len + jz .Ltail_vpmadd52_4x + jmp .Loop_vpmadd52_4x + ud2 + +.align 32 +.Linit_vpmadd52: + vmovq 24($ctx),%x#$S1 # load key + vmovq 56($ctx),%x#$H2 + vmovq 32($ctx),%x#$S2 + vmovq 40($ctx),%x#$R0 + vmovq 48($ctx),%x#$R1 + + vmovdqa $R0,$H0 + vmovdqa $R1,$H1 + vmovdqa $H2,$R2 + + mov \$2,%eax + +.Lmul_init_vpmadd52: + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + dec %eax + jz .Ldone_init_vpmadd52 + + vpunpcklqdq $R1,$H1,$R1 # 1,2 + vpbroadcastq %x#$H1,%x#$H1 # 2,2 + vpunpcklqdq $R2,$H2,$R2 + vpbroadcastq %x#$H2,%x#$H2 + vpunpcklqdq $R0,$H0,$R0 + vpbroadcastq %x#$H0,%x#$H0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R1,$S1,$S1 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S1,$S1 + vpsllq \$2,$S2,$S2 + + jmp .Lmul_init_vpmadd52 + ud2 + +.align 32 +.Ldone_init_vpmadd52: + vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 + vinserti128 \$1,%x#$R2,$H2,$R2 + vinserti128 \$1,%x#$R0,$H0,$R0 + + vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 + vpermq \$0b11011000,$R2,$R2 + vpermq \$0b11011000,$R0,$R0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpaddq $R1,$S1,$S1 + vpsllq \$2,$S1,$S1 + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? 
+ jnz .Ldone_init_vpmadd52_2x + + vmovdqu64 $R0,64($ctx) # save key powers + vpbroadcastq %x#$R0,$R0 # broadcast 4th power + vmovdqu64 $R1,96($ctx) + vpbroadcastq %x#$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpbroadcastq %x#$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpbroadcastq %x#$S1,$S1 + + jmp .Lblocks_vpmadd52_4x_key_loaded + ud2 + +.align 32 +.Ldone_init_vpmadd52_2x: + vmovdqu64 $R0,64($ctx) # save key powers + vpsrldq \$8,$R0,$R0 # 0-1-0-2 + vmovdqu64 $R1,96($ctx) + vpsrldq \$8,$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpsrldq \$8,$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpsrldq \$8,$S1,$S1 + jmp .Lblocks_vpmadd52_2x_key_loaded + ud2 + +.align 32 +.Lblocks_vpmadd52_2x_do: + vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers + vmovdqu64 160+8($ctx),${S1}{%k1}{z} + vmovdqu64 64+8($ctx),${R0}{%k1}{z} + vmovdqu64 96+8($ctx),${R1}{%k1}{z} + +.Lblocks_vpmadd52_2x_key_loaded: + vmovdqu64 16*0($inp),$T2 # load data + vpxorq $T3,$T3,$T3 + lea 16*2($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as x-1-x-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + jmp .Ltail_vpmadd52_2x + ud2 + +.align 32 +.Loop_vpmadd52_4x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$4,$len # len-=64 + jnz .Loop_vpmadd52_4x + +.Ltail_vpmadd52_4x: + vmovdqu64 128($ctx),$R2 # load all key powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + +.Ltail_vpmadd52_2x: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + 
vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + # at this point $len is + # either 4*n+2 or 0... + sub \$2,$len # len-=32 + ja .Lblocks_vpmadd52_4x_do + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_4x: + RET +.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x +___ +} +{ +######################################################################## +# As implied by its name 8x subroutine processes 8 blocks in parallel... +# This is intermediate version, as it's used only in cases when input +# length is either 8*n, 8*n+1 or 8*n+2... + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); +my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_8x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_8x: + shr \$4,$len + jz .Lno_data_vpmadd52_8x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + vmovdqa64 .Lx_mask44(%rip),$mask44 + vmovdqa64 .Lx_mask42(%rip),$mask42 + + test %r8,%r8 # is power value impossible? 
+ js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + +.Lblocks_vpmadd52_8x: + ################################################################ + # fist we calculate more key powers + + vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + vpbroadcastq %x#$R2,$RR2 # broadcast 4th power + vpbroadcastq %x#$R0,$RR0 + vpbroadcastq %x#$R1,$RR1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $RR2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $RR2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $RR2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $RR2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $RR2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $RR2,$R0,$D2hi + + vpmadd52luq $RR0,$R0,$D0lo + vpmadd52huq $RR0,$R0,$D0hi + vpmadd52luq $RR0,$R1,$D1lo + vpmadd52huq $RR0,$R1,$D1hi + vpmadd52luq $RR0,$R2,$D2lo + vpmadd52huq $RR0,$R2,$D2hi + + vpmadd52luq $RR1,$S2,$D0lo + vpmadd52huq $RR1,$S2,$D0hi + vpmadd52luq $RR1,$R0,$D1lo + vpmadd52huq $RR1,$R0,$D1hi + vpmadd52luq $RR1,$R1,$D2lo + vpmadd52huq $RR1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$RR0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$RR1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$RR2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + + vpsrlq \$44,$RR0,$tmp # additional step + vpandq $mask44,$RR0,$RR0 + + vpaddq $tmp,$RR1,$RR1 + + ################################################################ + # At this point Rx holds 1324 powers, RRx - 5768, and the goal + # is 15263748, which reflects how data is loaded... 
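+
+	# (For reference, not part of the generated code: in the 2^44 radix a
+	#  product x*r, with x = x0 + x1*2^44 + x2*2^88 and s1 = 20*r1,
+	#  s2 = 20*r2, is in scalar terms
+	#
+	#	d0 = x0*r0 + x1*s2 + x2*s1;	/* 2^132 = 20 mod 2^130-5 */
+	#	d1 = x0*r1 + x1*r0 + x2*s2;
+	#	d2 = x0*r2 + x1*r1 + x2*r0;
+	#
+	#  followed by a mask44/mask42 carry chain like the one just above,
+	#  whose top carry folds back into d0 as *5; the "S = R*5*4" shifts
+	#  elsewhere in this file are exactly these *20 multiples.)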
+ + vpunpcklqdq $R2,$RR2,$T2 # 3748 + vpunpckhqdq $R2,$RR2,$R2 # 1526 + vpunpcklqdq $R0,$RR0,$T0 + vpunpckhqdq $R0,$RR0,$R0 + vpunpcklqdq $R1,$RR1,$T1 + vpunpckhqdq $R1,$RR1,$R1 +___ +######## switch to %zmm +map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); +map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); + +$code.=<<___; + vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 + vshufi64x2 \$0x44,$R0,$T0,$RR0 + vshufi64x2 \$0x44,$R1,$T1,$RR1 + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + + vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 + vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 + vpaddq $RR2,$SS2,$SS2 + vpaddq $RR1,$SS1,$SS1 + vpsllq \$2,$SS2,$SS2 + vpsllq \$2,$SS1,$SS1 + + vpbroadcastq $padbit,$PAD + vpbroadcastq %x#$mask44,$mask44 + vpbroadcastq %x#$mask42,$mask42 + + vpbroadcastq %x#$SS1,$S1 # broadcast 8th power + vpbroadcastq %x#$SS2,$S2 + vpbroadcastq %x#$RR0,$R0 + vpbroadcastq %x#$RR1,$R1 + vpbroadcastq %x#$RR2,$R2 + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 73625140 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$8,$len + jz .Ltail_vpmadd52_8x + jmp .Loop_vpmadd52_8x + +.align 32 +.Loop_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$8,$len # len-=128 + jnz .Loop_vpmadd52_8x + +.Ltail_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$SS1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$SS1,$D0hi + vpxorq 
$D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$SS2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$SS2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$RR0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$RR0,$D2hi + + vpmadd52luq $H0,$RR0,$D0lo + vpmadd52huq $H0,$RR0,$D0hi + vpmadd52luq $H0,$RR1,$D1lo + vpmadd52huq $H0,$RR1,$D1hi + vpmadd52luq $H0,$RR2,$D2lo + vpmadd52huq $H0,$RR2,$D2hi + + vpmadd52luq $H1,$SS2,$D0lo + vpmadd52huq $H1,$SS2,$D0hi + vpmadd52luq $H1,$RR0,$D1lo + vpmadd52huq $H1,$RR0,$D1hi + vpmadd52luq $H1,$RR1,$D2lo + vpmadd52huq $H1,$RR1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vextracti64x4 \$1,$D0lo,%y#$T0 + vextracti64x4 \$1,$D0hi,%y#$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vextracti64x4 \$1,$D1lo,%y#$T1 + vextracti64x4 \$1,$D1hi,%y#$H1 + vextracti64x4 \$1,$D2lo,%y#$T2 + vextracti64x4 \$1,$D2hi,%y#$H2 +___ +######## switch back to %ymm +map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); + +$code.=<<___; + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + ################################################################ + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_8x: + RET +.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x +___ +} +$code.=<<___; +.type poly1305_emit_base2_44,\@function,3 +.align 32 +poly1305_emit_base2_44: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r9,%rax + shr \$20,%r9 + shl \$44,%rax + mov %r10,%rcx + shr \$40,%r10 + shl \$24,%rcx + + add %rax,%r8 + adc %rcx,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? 
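+	# (%rax/%rcx still hold h while %r8/%r9/%r10 hold h+5; if h+5 has any
+	#  bit set at or above 2^130 this shift leaves %r10 non-zero, meaning
+	#  h >= 2^130-5, and the reduced value is the low 128 bits of h+5;
+	#  otherwise h is already fully reduced.)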
+ cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + RET +.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 +___ +} } } +} + +if (!$kernel) +{ # chacha20-poly1305 helpers +my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order +$code.=<<___; +.globl xor128_encrypt_n_pad +.type xor128_encrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_encrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_enc + nop +.Loop_enc_xmm: + movdqu ($inp,$otp),%xmm0 + pxor ($otp),%xmm0 + movdqu %xmm0,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_enc_xmm + + and \$15,%r10 # len % 16 + jz .Ldone_enc + +.Ltail_enc: + mov \$16,$len + sub %r10,$len + xor %eax,%eax +.Loop_enc_byte: + mov ($inp,$otp),%al + xor ($otp),%al + mov %al,($out,$otp) + mov %al,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_enc_byte + + xor %eax,%eax +.Loop_enc_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_enc_pad + +.Ldone_enc: + mov $otp,%rax + RET +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad + +.globl xor128_decrypt_n_pad +.type xor128_decrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_decrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_dec + nop +.Loop_dec_xmm: + movdqu ($inp,$otp),%xmm0 + movdqa ($otp),%xmm1 + pxor %xmm0,%xmm1 + movdqu %xmm1,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_dec_xmm + + pxor %xmm1,%xmm1 + and \$15,%r10 # len % 16 + jz .Ldone_dec + +.Ltail_dec: + mov \$16,$len + sub %r10,$len + xor %eax,%eax + xor %r11d,%r11d +.Loop_dec_byte: + mov ($inp,$otp),%r11b + mov ($otp),%al + xor %r11b,%al + mov %al,($out,$otp) + mov %r11b,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_dec_byte + + xor %eax,%eax +.Loop_dec_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_dec_pad + +.Ldone_dec: + mov $otp,%rax + RET +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad +___ +} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + lea 48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R14 + + jmp 
.Lcommon_seh_tail +.size se_handler,.-se_handler + +.type avx_handler,\@abi-omnipotent +.align 16 +avx_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 208($context),%rax # pull context->R11 + + lea 0x50(%rax),%rsi + lea 0xf8(%rax),%rax + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + RET +.size avx_handler,.-avx_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_poly1305_block_init_arch + .rva .LSEH_end_poly1305_block_init_arch + .rva .LSEH_info_poly1305_block_init_arch + + .rva .LSEH_begin_poly1305_blocks_x86_64 + .rva .LSEH_end_poly1305_blocks_x86_64 + .rva .LSEH_info_poly1305_blocks_x86_64 + + .rva .LSEH_begin_poly1305_emit_x86_64 + .rva .LSEH_end_poly1305_emit_x86_64 + .rva .LSEH_info_poly1305_emit_x86_64 +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_poly1305_blocks_avx + .rva .Lbase2_64_avx + .rva .LSEH_info_poly1305_blocks_avx_1 + + .rva .Lbase2_64_avx + .rva .Leven_avx + .rva .LSEH_info_poly1305_blocks_avx_2 + + .rva .Leven_avx + .rva .LSEH_end_poly1305_blocks_avx + .rva .LSEH_info_poly1305_blocks_avx_3 + + .rva .LSEH_begin_poly1305_emit_avx + .rva .LSEH_end_poly1305_emit_avx + .rva .LSEH_info_poly1305_emit_avx +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_poly1305_blocks_avx2 + .rva .Lbase2_64_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_1 + + .rva .Lbase2_64_avx2 + .rva .Leven_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_2 + + .rva .Leven_avx2 + .rva .LSEH_end_poly1305_blocks_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_3 +___ +$code.=<<___ if ($avx>2); + .rva .LSEH_begin_poly1305_blocks_avx512 + .rva .LSEH_end_poly1305_blocks_avx512 + .rva .LSEH_info_poly1305_blocks_avx512 +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_poly1305_block_init_arch: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch + +.LSEH_info_poly1305_blocks_x86_64: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_body,.Lblocks_epilogue + +.LSEH_info_poly1305_emit_x86_64: + .byte 9,0,0,0 + .rva se_handler
+ .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 +___ +$code.=<<___ if ($avx); +.LSEH_info_poly1305_blocks_avx_1: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx_2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx_3: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_emit_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx +___ +$code.=<<___ if ($avx>1); +.LSEH_info_poly1305_blocks_avx2_1: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx2_2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx2_3: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] +___ +$code.=<<___ if ($avx>2); +.LSEH_info_poly1305_blocks_avx512: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] +___ +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach (split('\n',$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + s/%r([a-z]+)#d/%e$1/g; + s/%r([0-9]+)#d/%r$1d/g; + s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; + + if ($kernel) { + s/(^\.type.*),[0-9]+$/\1/; + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; + next if /^\.cfi.*/; + } + + print $_,"\n"; +} +close STDOUT; diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305_glue.c new file mode 100644 index 000000000000..b7e78a583e07 --- /dev/null +++ b/lib/crypto/x86/poly1305_glue.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct poly1305_arch_internal { + union { + struct { + u32 h[5]; + u32 is_base2_26; + }; + u64 hs[3]; + }; + u64 r[2]; + u64 pad; + struct { u32 r2, r1, r4, r3; } rn[9]; +}; + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx, + const u8 *inp, + const size_t len, const u32 padbit); +asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx, + const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx, + const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx, + const u8 *inp, + const size_t len, const u32 padbit); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, + unsigned int len, u32 padbit) +{ + struct poly1305_arch_internal *ctx = + container_of(&state->h.h, struct poly1305_arch_internal, h); + + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || + SZ_4K % POLY1305_BLOCK_SIZE); + + if (!static_branch_likely(&poly1305_use_avx)) { + poly1305_blocks_x86_64(ctx, inp, len, padbit); + return; + } + + do { + const unsigned int bytes = min(len, SZ_4K); + + kernel_fpu_begin(); + if (static_branch_likely(&poly1305_use_avx512)) + poly1305_blocks_avx512(ctx, inp, bytes, padbit); + else if (static_branch_likely(&poly1305_use_avx2)) + poly1305_blocks_avx2(ctx, inp, bytes, padbit); + else + poly1305_blocks_avx(ctx, inp, bytes, padbit); + kernel_fpu_end(); + + len -= bytes; + inp += bytes; + } while (len); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +void poly1305_emit_arch(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) +{ + if (!static_branch_likely(&poly1305_use_avx)) + poly1305_emit_x86_64(ctx, mac, nonce); + else + poly1305_emit_avx(ctx, mac, nonce); +} +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return static_key_enabled(&poly1305_use_avx); +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init poly1305_simd_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_AVX) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx); + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx2); + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. 
*/ + boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) + static_branch_enable(&poly1305_use_avx512); + return 0; +} +subsys_initcall(poly1305_simd_mod_init); + +static void __exit poly1305_simd_mod_exit(void) +{ +} +module_exit(poly1305_simd_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jason A. Donenfeld "); +MODULE_DESCRIPTION("Poly1305 authenticator"); diff --git a/lib/crypto/x86/sha256-avx-asm.S b/lib/crypto/x86/sha256-avx-asm.S new file mode 100644 index 000000000000..0d7b2c3e45d9 --- /dev/null +++ b/lib/crypto/x86/sha256-avx-asm.S @@ -0,0 +1,499 @@ +######################################################################## +# Implement fast SHA-256 with AVX1 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford +# Kirk Yap +# Tim Chen +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-256 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. 
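+#
+# For reference, the vectorized code below computes the standard FIPS 180-4
+# SHA-256 round and message schedule; in scalar C each round is
+#
+#	S1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
+#	ch = ((f ^ g) & e) ^ g;
+#	t1 = h + S1 + ch + K256[i] + W[i];
+#	S0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
+#	maj = ((a | c) & b) | (a & c);
+#	h = g; g = f; f = e; e = d + t1;
+#	d = c; c = b; b = a; a = t1 + S0 + maj;
+#
+# and the message schedule for rounds 16..63 is
+#
+#	s0(x) = ror32(x, 7)  ^ ror32(x, 18) ^ (x >> 3);
+#	s1(x) = ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
+#	W[i]  = W[i-16] + s0(W[i-15]) + W[i-7] + s1(W[i-2]);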
+# +######################################################################## +# This code schedules 1 block at a time, with 4 lanes per block +######################################################################## + +#include +#include + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + + +.macro MY_ROR p1 p2 + shld $(32-(\p1)), \p2, \p2 +.endm + +################################ + +# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +# Load xmm with mem and byte swap each dword +.macro COPY_XMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p2, \p1 + vpshufb \p3, \p1, \p1 +.endm + +################################ + +X0 = %xmm4 +X1 = %xmm5 +X2 = %xmm6 +X3 = %xmm7 + +XTMP0 = %xmm0 +XTMP1 = %xmm1 +XTMP2 = %xmm2 +XTMP3 = %xmm3 +XTMP4 = %xmm8 +XFER = %xmm9 +XTMP5 = %xmm11 + +SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %xmm13 + +NUM_BLKS = %rdx # 3rd arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg + +SRND = %rsi # clobbers INP +c = %ecx +d = %r8d +e = %edx +TBL = %r12 +a = %eax +b = %ebx + +f = %r9d +g = %r10d +h = %r11d + +y0 = %r13d +y1 = %r14d +y2 = %r15d + + +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_XFER_SIZE = 16 +_XMM_SAVE_SIZE = 0 + +_INP_END = 0 +_INP = _INP_END + _INP_END_SIZE +_XFER = _INP + _INP_SIZE +_XMM_SAVE = _XFER + _XFER_SIZE +STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs +X_ = X0 +X0 = X1 +X1 = X2 +X2 = X3 +X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED + ## compute s0 four at a time and s1 two at a time + ## compute W[-16] + W[-7] 4 at a time + + mov e, y0 # y0 = e + MY_ROR (25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] + MY_ROR (22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + ## compute s0 + vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y0, y2 # y2 = S1 + CH + add _XFER(%rsp), y2 # y2 = k + w + S1 + CH + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpsrld $7, XTMP1, XTMP2 + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + vpslld $(32-7), XTMP1, XTMP3 + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + mov e, y0 # y0 = e + mov a, y1 # y1 = a + MY_ROR (25-11), y0 # y0 = e >> (25-11) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + MY_ROR (22-13), y1 # y1 = a >> (22-13) + vpsrld $18, XTMP1, XTMP2 # + xor a, y1 # y1 = a ^ (a >> (22-13) + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + 
vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + vpslld $(32-18), XTMP1, XTMP1 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + vpxor XTMP1, XTMP3, XTMP3 # + add y0, y2 # y2 = S1 + CH + add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + ## compute low s1 + vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + mov e, y0 # y0 = e + mov a, y1 # y1 = a + MY_ROR (25-11), y0 # y0 = e >> (25-11) + xor e, y0 # y0 = e ^ (e >> (25-11)) + MY_ROR (22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + xor g, y2 # y2 = f^g + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA} + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + vpxor XTMP3, XTMP2, XTMP2 # + add y0, y2 # y2 = S1 + CH + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + ## compute high s1 + vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + mov e, y0 # y0 = e + MY_ROR (25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + MY_ROR (22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC} + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + vpxor XTMP3, XTMP2, XTMP2 + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y0, y2 # y2 = S1 + CH + add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + vpxor XTMP2, XTMP5, 
XTMP5 # XTMP5 = s1 {xDxC} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + rotate_Xs +.endm + +## input is [rsp + _XFER + %1 * 4] +.macro DO_ROUND round + mov e, y0 # y0 = e + MY_ROR (25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + xor e, y0 # y0 = e ^ (e >> (25-11)) + MY_ROR (22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + and e, y2 # y2 = (f^g)&e + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + add y0, y2 # y2 = S1 + CH + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + offset = \round * 4 + _XFER # + add offset(%rsp), y2 # y2 = k + w + S1 + CH + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +.endm + +######################################################################## +## void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], +## const u8 *data, size_t nblocks); +######################################################################## +.text +SYM_FUNC_START(sha256_transform_avx) + ANNOTATE_NOENDBR # since this is called only via static_call + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbp + movq %rsp, %rbp + + subq $STACK_SIZE, %rsp # allocate stack space + and $~15, %rsp # align stack pointer + + shl $6, NUM_BLKS # convert to bytes + jz .Ldone_hash + add INP, NUM_BLKS # pointer to end of data + mov NUM_BLKS, _INP_END(%rsp) + + ## load initial digest + mov 4*0(CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 +.Lloop0: + lea K256(%rip), TBL + + ## byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK + + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 16 each + mov $3, SRND +.align 16 +.Lloop1: + vpaddd (TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddd 1*16(TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddd 2*16(TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddd 3*16(TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + add $4*16, TBL + FOUR_ROUNDS_AND_SCHED + + sub $1, SRND + jne .Lloop1 + + mov $2, SRND +.Lloop2: + vpaddd (TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vpaddd 1*16(TBL), X1, XFER + 
vmovdqa XFER, _XFER(%rsp) + add $2*16, TBL + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vmovdqa X2, X0 + vmovdqa X3, X1 + + sub $1, SRND + jne .Lloop2 + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + mov _INP(%rsp), INP + add $64, INP + cmp _INP_END(%rsp), INP + jne .Lloop0 + +.Ldone_hash: + + mov %rbp, %rsp + popq %rbp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + RET +SYM_FUNC_END(sha256_transform_avx) + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 +.align 16 +# shuffle xBxA -> 00BA +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 +.align 16 +# shuffle xDxC -> DC00 +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha256-avx2-asm.S b/lib/crypto/x86/sha256-avx2-asm.S new file mode 100644 index 000000000000..25d3380321ec --- /dev/null +++ b/lib/crypto/x86/sha256-avx2-asm.S @@ -0,0 +1,774 @@ +######################################################################## +# Implement fast SHA-256 with AVX2 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford +# Kirk Yap +# Tim Chen +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-256 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## +# This code schedules 2 blocks at a time, with 4 lanes per block +######################################################################## + +#include +#include + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + +################################ + +X0 = %ymm4 +X1 = %ymm5 +X2 = %ymm6 +X3 = %ymm7 + +# XMM versions of above +XWORD0 = %xmm4 +XWORD1 = %xmm5 +XWORD2 = %xmm6 +XWORD3 = %xmm7 + +XTMP0 = %ymm0 +XTMP1 = %ymm1 +XTMP2 = %ymm2 +XTMP3 = %ymm3 +XTMP4 = %ymm8 +XFER = %ymm9 +XTMP5 = %ymm11 + +SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %ymm13 + +X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK + +NUM_BLKS = %rdx # 3rd arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg +c = %ecx +d = %r8d +e = %edx # clobbers NUM_BLKS +y3 = %esi # clobbers INP + +SRND = CTX # SRND is same register as CTX + +a = %eax +b = %ebx +f = %r9d +g = %r10d +h = %r11d +old_h = %r11d + +T1 = %r12d +y0 = %r13d +y1 = %r14d +y2 = %r15d + + +_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round +_XMM_SAVE_SIZE = 0 +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_CTX_SIZE = 8 + +_XFER = 0 +_XMM_SAVE = _XFER + _XFER_SIZE +_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE +_INP = _INP_END + _INP_END_SIZE +_CTX = _INP + _INP_SIZE +STACK_SIZE = _CTX + _CTX_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs + X_ = X0 + X0 = X1 + X1 = X2 + X2 = X3 + X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED disp +################################### RND N + 0 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + + addl \disp(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + + and e, y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + vpsrld $7, XTMP1, XTMP2 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + vpslld $(32-7), 
XTMP1, XTMP3 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 + + vpsrld $18, XTMP1, XTMP2 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + + ROTATE_ARGS + +################################### RND N + 1 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + offset = \disp + 1*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $6, e, y1 # y1 = (e >> 6) # S1 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + vpslld $(32-18), XTMP1, XTMP1 + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + + vpxor XTMP1, XTMP3, XTMP3 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 + vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + + + ROTATE_ARGS + +################################### RND N + 2 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + offset = \disp + 2*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} + rorx $11, e, y1 # y1 = e >> 11 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} + and e, y2 # y2 = (f^g)&e # CH + + rorx $6, e, y1 # y1 = (e >> 6) # S1 + vpxor XTMP3, XTMP2, XTMP2 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a ,T1 # T1 = (a >> 2) # S0 + vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1,h # h = k + w + h + S0 # -- + add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3,h # h = t1 + S0 + MAJ # -- + + + ROTATE_ARGS + +################################### RND N + 3 ############################ + 
+ mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + offset = \disp + 3*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + + + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpxor XTMP3, XTMP2, XTMP2 + rorx $22, a, y1 # y1 = a >> 22 # S0A + add y0, y2 # y2 = S1 + CH # -- + + vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $2, a, T1 # T1 = (a >> 2) # S0 + vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} + + vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + ROTATE_ARGS + rotate_Xs +.endm + +.macro DO_4ROUNDS disp +################################### RND N + 0 ########################### + + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + addl \disp(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 1 ########################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*1 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = 
S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 2 ############################## + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*2 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 3 ########################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*3 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + ROTATE_ARGS + +.endm + +######################################################################## +## void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], +## const u8 *data, size_t nblocks); +######################################################################## +.text +SYM_FUNC_START(sha256_transform_rorx) + ANNOTATE_NOENDBR # since this is called only via static_call + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + push %rbp + mov %rsp, %rbp + + subq $STACK_SIZE, %rsp + and $-32, %rsp # align rsp to 32 byte boundary + + shl $6, NUM_BLKS # convert to bytes + jz .Ldone_hash + lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block + mov NUM_BLKS, _INP_END(%rsp) + + cmp NUM_BLKS, INP + je .Lonly_one_block + + ## load initial digest + mov (CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 
4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 + + mov CTX, _CTX(%rsp) + +.Lloop0: + ## Load first 16 dwords from two blocks + VMOVDQ 0*32(INP),XTMP0 + VMOVDQ 1*32(INP),XTMP1 + VMOVDQ 2*32(INP),XTMP2 + VMOVDQ 3*32(INP),XTMP3 + + ## byte swap data + vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0 + vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 + vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 + vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 + + ## transpose data into high/low halves + vperm2i128 $0x20, XTMP2, XTMP0, X0 + vperm2i128 $0x31, XTMP2, XTMP0, X1 + vperm2i128 $0x20, XTMP3, XTMP1, X2 + vperm2i128 $0x31, XTMP3, XTMP1, X3 + +.Llast_block_enter: + add $64, INP + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 12 each + xor SRND, SRND + +.align 16 +.Lloop1: + leaq K256+0*32(%rip), INP ## reuse INP as scratch reg + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED (_XFER + 0*32) + + leaq K256+1*32(%rip), INP + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED (_XFER + 1*32) + + leaq K256+2*32(%rip), INP + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 2*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED (_XFER + 2*32) + + leaq K256+3*32(%rip), INP + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 3*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED (_XFER + 3*32) + + add $4*32, SRND + cmp $3*4*32, SRND + jb .Lloop1 + +.Lloop2: + ## Do last 16 rounds with no scheduling + leaq K256+0*32(%rip), INP + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) + DO_4ROUNDS (_XFER + 0*32) + + leaq K256+1*32(%rip), INP + vpaddd (INP, SRND), X1, XFER + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) + DO_4ROUNDS (_XFER + 1*32) + add $2*32, SRND + + vmovdqa X2, X0 + vmovdqa X3, X1 + + cmp $4*4*32, SRND + jb .Lloop2 + + mov _CTX(%rsp), CTX + mov _INP(%rsp), INP + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + cmp _INP_END(%rsp), INP + ja .Ldone_hash + + #### Do second block using previously scheduled results + xor SRND, SRND +.align 16 +.Lloop3: + DO_4ROUNDS (_XFER + 0*32 + 16) + DO_4ROUNDS (_XFER + 1*32 + 16) + add $2*32, SRND + cmp $4*4*32, SRND + jb .Lloop3 + + mov _CTX(%rsp), CTX + mov _INP(%rsp), INP + add $64, INP + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + cmp _INP_END(%rsp), INP + jb .Lloop0 + ja .Ldone_hash + +.Ldo_last_block: + VMOVDQ 0*16(INP),XWORD0 + VMOVDQ 1*16(INP),XWORD1 + VMOVDQ 2*16(INP),XWORD2 + VMOVDQ 3*16(INP),XWORD3 + + vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 + vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 + vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 + vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 + + jmp .Llast_block_enter + +.Lonly_one_block: + + ## load initial digest + mov (4*0)(CTX),a + mov (4*1)(CTX),b + mov (4*2)(CTX),c + mov (4*3)(CTX),d + mov (4*4)(CTX),e + mov (4*5)(CTX),f + mov (4*6)(CTX),g + mov (4*7)(CTX),h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 + + mov CTX, _CTX(%rsp) + jmp .Ldo_last_block + +.Ldone_hash: + + mov %rbp, %rsp + pop %rbp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + vzeroupper + RET +SYM_FUNC_END(sha256_transform_rorx) + +.section 
.rodata.cst512.K256, "aM", @progbits, 512 +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 + +# shuffle xBxA -> 00BA +.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32 +.align 32 +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 + +# shuffle xDxC -> DC00 +.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32 +.align 32 +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S new file mode 100644 index 000000000000..d3548206cf3d --- /dev/null +++ b/lib/crypto/x86/sha256-ni-asm.S @@ -0,0 +1,196 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley + * Tim Chen + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include + +#define STATE_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 /* sha256rnds2 implicit operand */ +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define TMP %xmm7 + +#define SHUF_MASK %xmm8 + +#define ABEF_SAVE %xmm9 +#define CDGH_SAVE %xmm10 + +.macro do_4rounds i, m0, m1, m2, m3 +.if \i < 16 + movdqu \i*4(DATA_PTR), \m0 + pshufb SHUF_MASK, \m0 +.endif + movdqa (\i-32)*4(SHA256CONSTANTS), MSG + paddd \m0, MSG + sha256rnds2 STATE0, STATE1 +.if \i >= 12 && \i < 60 + movdqa \m0, TMP + palignr $4, \m3, TMP + paddd TMP, \m1 + sha256msg2 \m0, \m1 +.endif + punpckhqdq MSG, MSG + sha256rnds2 STATE1, STATE0 +.if \i >= 4 && \i < 52 + sha256msg1 \m0, \m3 +.endif +.endm + +/* + * Intel SHA Extensions optimized implementation of a SHA-256 block function + * + * This function takes a pointer to the current SHA-256 state, a pointer to the + * input data, and the number of 64-byte blocks to process. Once all blocks + * have been processed, the state is updated with the new state. This function + * only processes complete blocks. State initialization, buffering of partial + * blocks, and digest finalization is expected to be handled elsewhere. 
+ * + * void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], + * const u8 *data, size_t nblocks); + */ +.text +SYM_FUNC_START(sha256_ni_transform) + ANNOTATE_NOENDBR # since this is called only via static_call + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* + * load initial hash values + * Need to reorder these appropriately + * DCBA, HGFE -> ABEF, CDGH + */ + movdqu 0*16(STATE_PTR), STATE0 /* DCBA */ + movdqu 1*16(STATE_PTR), STATE1 /* HGFE */ + + movdqa STATE0, TMP + punpcklqdq STATE1, STATE0 /* FEBA */ + punpckhqdq TMP, STATE1 /* DCHG */ + pshufd $0x1B, STATE0, STATE0 /* ABEF */ + pshufd $0xB1, STATE1, STATE1 /* CDGH */ + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + lea K256+32*4(%rip), SHA256CONSTANTS + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa STATE0, ABEF_SAVE + movdqa STATE1, CDGH_SAVE + +.irp i, 0, 16, 32, 48 + do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3 + do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0 + do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1 + do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2 +.endr + + /* Add current hash values with previously saved */ + paddd ABEF_SAVE, STATE0 + paddd CDGH_SAVE, STATE1 + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + movdqa STATE0, TMP + punpcklqdq STATE1, STATE0 /* GHEF */ + punpckhqdq TMP, STATE1 /* ABCD */ + pshufd $0xB1, STATE0, STATE0 /* HGFE */ + pshufd $0x1B, STATE1, STATE1 /* DCBA */ + + movdqu STATE1, 0*16(STATE_PTR) + movdqu STATE0, 1*16(STATE_PTR) + +.Ldone_hash: + + RET +SYM_FUNC_END(sha256_ni_transform) + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/lib/crypto/x86/sha256-ssse3-asm.S b/lib/crypto/x86/sha256-ssse3-asm.S new file mode 100644 index 000000000000..7f24a4cdcb25 --- /dev/null +++ b/lib/crypto/x86/sha256-ssse3-asm.S @@ -0,0 +1,511 @@ +######################################################################## +# Implement fast SHA-256 with SSSE3 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford +# Kirk Yap +# Tim Chen +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-256 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#include +#include + +## assume buffers not aligned +#define MOVDQ movdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + +################################ + +# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +# Load xmm with mem and byte swap each dword +.macro COPY_XMM_AND_BSWAP p1 p2 p3 + MOVDQ \p2, \p1 + pshufb \p3, \p1 +.endm + +################################ + +X0 = %xmm4 +X1 = %xmm5 +X2 = %xmm6 +X3 = %xmm7 + +XTMP0 = %xmm0 +XTMP1 = %xmm1 +XTMP2 = %xmm2 +XTMP3 = %xmm3 +XTMP4 = %xmm8 +XFER = %xmm9 + +SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %xmm12 + +NUM_BLKS = %rdx # 3rd arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg + +SRND = %rsi # clobbers INP +c = %ecx +d = %r8d +e = %edx +TBL = %r12 +a = %eax +b = %ebx + +f = %r9d +g = %r10d +h = %r11d + +y0 = %r13d +y1 = %r14d +y2 = %r15d + + + +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_XFER_SIZE = 16 +_XMM_SAVE_SIZE = 0 + +_INP_END = 0 +_INP = _INP_END + _INP_END_SIZE +_XFER = _INP + _INP_SIZE +_XMM_SAVE = _XFER + _XFER_SIZE +STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs +X_ = X0 +X0 = X1 +X1 = X2 +X2 = X3 +X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED + ## compute s0 four at a time and s1 two at a time + ## compute W[-16] + W[-7] 4 at a time + movdqa X3, XTMP0 + mov e, y0 # y0 = e + ror $(25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + palignr $4, X2, XTMP0 # XTMP0 = W[-7] + ror $(22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + movdqa X1, XTMP1 + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 
# y2 = f^g + paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16] + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + ## compute s0 + palignr $4, X0, XTMP1 # XTMP1 = W[-15] + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + movdqa XTMP1, XTMP2 # XTMP2 = W[-15] + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y0, y2 # y2 = S1 + CH + add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH + movdqa XTMP1, XTMP3 # XTMP3 = W[-15] + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pslld $(32-7), XTMP1 # + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + psrld $7, XTMP2 # + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + # + ROTATE_ARGS # + movdqa XTMP3, XTMP2 # XTMP2 = W[-15] + mov e, y0 # y0 = e + mov a, y1 # y1 = a + movdqa XTMP3, XTMP4 # XTMP4 = W[-15] + ror $(25-11), y0 # y0 = e >> (25-11) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + ror $(22-13), y1 # y1 = a >> (22-13) + pslld $(32-18), XTMP3 # + xor a, y1 # y1 = a ^ (a >> (22-13) + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + psrld $18, XTMP2 # + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP3, XTMP1 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + psrld $3, XTMP4 # XTMP4 = W[-15] >> 3 + add y0, y2 # y2 = S1 + CH + add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pxor XTMP4, XTMP1 # XTMP1 = s0 + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + ## compute low s1 + pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + + ROTATE_ARGS + movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA} + mov e, y0 # y0 = e + mov a, y1 # y1 = a + ror $(25-11), y0 # y0 = e >> (25-11) + movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA} + xor e, y0 # y0 = e ^ (e >> (25-11)) + ror $(22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} + xor g, y2 # y2 = f^g + psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP3, XTMP2 + add y0, y2 # y2 = S1 + CH + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + 
pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + ## compute high s1 + pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA} + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + # + ROTATE_ARGS # + movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC} + mov e, y0 # y0 = e + ror $(25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + movdqa XTMP2, X0 # X0 = W[-2] {DDCC} + ror $(22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25 + and e, y2 # y2 = (f^g)&e + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + psrld $10, X0 # X0 = W[-2] >> 10 {DDCC} + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22 + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2 + xor g, y2 # y2 = CH = ((f^g)&e)^g + pxor XTMP3, XTMP2 # + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2 + add y0, y2 # y2 = S1 + CH + add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + pxor XTMP2, X0 # X0 = s1 {xDxC} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pshufb SHUF_DC00, X0 # X0 = s1 {DC00} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + + ROTATE_ARGS + rotate_Xs +.endm + +## input is [rsp + _XFER + %1 * 4] +.macro DO_ROUND round + mov e, y0 # y0 = e + ror $(25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + xor e, y0 # y0 = e ^ (e >> (25-11)) + ror $(22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + and e, y2 # y2 = (f^g)&e + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + add y0, y2 # y2 = S1 + CH + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + offset = \round * 4 + _XFER + add offset(%rsp), y2 # y2 = k + w + S1 + CH + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +.endm + +######################################################################## +## void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], +## const u8 *data, size_t nblocks); +######################################################################## +.text +SYM_FUNC_START(sha256_transform_ssse3) + ANNOTATE_NOENDBR # since this is called only via static_call + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbp + mov %rsp, %rbp + + subq $STACK_SIZE, %rsp + and $~15, %rsp + + shl $6, NUM_BLKS # convert to bytes + jz .Ldone_hash 
+ add INP, NUM_BLKS + mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data + + ## load initial digest + mov 4*0(CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + movdqa _SHUF_00BA(%rip), SHUF_00BA + movdqa _SHUF_DC00(%rip), SHUF_DC00 + +.Lloop0: + lea K256(%rip), TBL + + ## byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK + + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 16 each + mov $3, SRND +.align 16 +.Lloop1: + movdqa (TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + movdqa 1*16(TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + movdqa 2*16(TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + movdqa 3*16(TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + add $4*16, TBL + FOUR_ROUNDS_AND_SCHED + + sub $1, SRND + jne .Lloop1 + + mov $2, SRND +.Lloop2: + paddd (TBL), X0 + movdqa X0, _XFER(%rsp) + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + paddd 1*16(TBL), X1 + movdqa X1, _XFER(%rsp) + add $2*16, TBL + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + movdqa X2, X0 + movdqa X3, X1 + + sub $1, SRND + jne .Lloop2 + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + mov _INP(%rsp), INP + add $64, INP + cmp _INP_END(%rsp), INP + jne .Lloop0 + +.Ldone_hash: + + mov %rbp, %rsp + popq %rbp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + + RET +SYM_FUNC_END(sha256_transform_ssse3) + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 +.align 16 +# shuffle xBxA -> 00BA +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 +.align 16 +# shuffle xDxC -> DC00 +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha256.c b/lib/crypto/x86/sha256.c new file mode 100644 index 000000000000..80380f8fdcee --- /dev/null +++ b/lib/crypto/x86/sha256.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-256 optimized for x86_64 + * + * Copyright 2025 Google LLC + */ +#include 
+#include +#include +#include +#include + +asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); +asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); +asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); +asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86); + +DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3); + +void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha256_x86)) { + kernel_fpu_begin(); + static_call(sha256_blocks_x86)(state, data, nblocks); + kernel_fpu_end(); + } else { + sha256_blocks_generic(state, data, nblocks); + } +} +EXPORT_SYMBOL_GPL(sha256_blocks_simd); + +void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + sha256_blocks_generic(state, data, nblocks); +} +EXPORT_SYMBOL_GPL(sha256_blocks_arch); + +bool sha256_is_arch_optimized(void) +{ + return static_key_enabled(&have_sha256_x86); +} +EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); + +static int __init sha256_x86_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) { + static_call_update(sha256_blocks_x86, sha256_ni_transform); + } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | + XFEATURE_MASK_YMM, NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha256_blocks_x86, + sha256_transform_rorx); + else + static_call_update(sha256_blocks_x86, + sha256_transform_avx); + } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) { + return 0; + } + static_branch_enable(&have_sha256_x86); + return 0; +} +subsys_initcall(sha256_x86_mod_init); + +static void __exit sha256_x86_mod_exit(void) +{ +} +module_exit(sha256_x86_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-256 optimized for x86_64"); -- cgit v1.2.3 From a6d2f48b00f71590e1e4f174a88bd149516e6440 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 19 Jun 2025 12:19:08 -0700 Subject: MAINTAINERS: Drop arch/*/lib/crypto/ pattern Since all files arch/*/lib/crypto/* have been moved into lib/crypto/, remove the arch/*/lib/crypto/ file pattern from MAINTAINERS. Acked-by: Ard Biesheuvel Reviewed-by: Martin K. Petersen Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/r/20250619191908.134235-10-ebiggers@kernel.org Signed-off-by: Eric Biggers --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4bac4ea21b64..dabd8f3fae9b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6411,7 +6411,6 @@ L: linux-crypto@vger.kernel.org S: Maintained T: git https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git libcrypto-next T: git https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git libcrypto-fixes -F: arch/*/lib/crypto/ F: lib/crypto/ CRYPTO SPEED TEST COMPARE -- cgit v1.2.3 From 22375adaa0d9fbba9646c8e2b099c6e87c97bfae Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 19 Jun 2025 15:55:35 -0700 Subject: lib/crypto: mips/chacha: Fix clang build and remove unneeded byteswap The MIPS32r2 ChaCha code has never been buildable with the clang assembler. First, clang doesn't support the 'rotl' pseudo-instruction: error: unknown instruction, did you mean: rol, rotr? 
Second, clang requires that both operands of the 'wsbh' instruction be explicitly given: error: too few operands for instruction To fix this, align the code with the real instruction set by (1) using the real instruction 'rotr' instead of the nonstandard pseudo- instruction 'rotl', and (2) explicitly giving both operands to 'wsbh'. To make removing the use of 'rotl' a bit easier, also remove the unnecessary special-casing for big endian CPUs at .Lchacha_mips_xor_bytes. The tail handling is actually endian-independent since it processes one byte at a time. On big endian CPUs the old code byte-swapped SAVED_X, then iterated through it in reverse order. But the byteswap and reverse iteration canceled out. Tested with chacha20poly1305-selftest in QEMU using "-M malta" with both little endian and big endian mips32r2 kernels. Fixes: 49aa7c00eddf ("crypto: mips/chacha - import 32r2 ChaCha code from Zinc") Cc: stable@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505080409.EujEBwA0-lkp@intel.com/ Link: https://lore.kernel.org/r/20250619225535.679301-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/mips/chacha-core.S | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/lib/crypto/mips/chacha-core.S b/lib/crypto/mips/chacha-core.S index 5755f69cfe00..706aeb850fb0 100644 --- a/lib/crypto/mips/chacha-core.S +++ b/lib/crypto/mips/chacha-core.S @@ -55,17 +55,13 @@ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define MSB 0 #define LSB 3 -#define ROTx rotl -#define ROTR(n) rotr n, 24 #define CPU_TO_LE32(n) \ - wsbh n; \ + wsbh n, n; \ rotr n, 16; #else #define MSB 3 #define LSB 0 -#define ROTx rotr #define CPU_TO_LE32(n) -#define ROTR(n) #endif #define FOR_EACH_WORD(x) \ @@ -192,10 +188,10 @@ CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ xor X(W), X(B); \ xor X(Y), X(C); \ xor X(Z), X(D); \ - rotl X(V), S; \ - rotl X(W), S; \ - rotl X(Y), S; \ - rotl X(Z), S; + rotr X(V), 32 - S; \ + rotr X(W), 32 - S; \ + rotr X(Y), 32 - S; \ + rotr X(Z), 32 - S; .text .set reorder @@ -372,21 +368,19 @@ chacha_crypt_arch: /* First byte */ lbu T1, 0(IN) addiu $at, BYTES, 1 - CPU_TO_LE32(SAVED_X) - ROTR(SAVED_X) xor T1, SAVED_X sb T1, 0(OUT) beqz $at, .Lchacha_mips_xor_done /* Second byte */ lbu T1, 1(IN) addiu $at, BYTES, 2 - ROTx SAVED_X, 8 + rotr SAVED_X, 8 xor T1, SAVED_X sb T1, 1(OUT) beqz $at, .Lchacha_mips_xor_done /* Third byte */ lbu T1, 2(IN) - ROTx SAVED_X, 8 + rotr SAVED_X, 8 xor T1, SAVED_X sb T1, 2(OUT) b .Lchacha_mips_xor_done -- cgit v1.2.3 From 1cf5cdf8d2ae3a55cc4890c02feb5da18e423e5d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:32 -0700 Subject: libceph: Rename hmac_sha256() to ceph_hmac_sha256() Rename hmac_sha256() to ceph_hmac_sha256(), to avoid a naming conflict with the upcoming hmac_sha256() library function. This code will be able to use the HMAC-SHA256 library, but that's left for a later commit. 
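For illustration only, that later conversion could look roughly like the sketch below. It assumes the library gains an incremental hmac_sha256_init()/hmac_sha256_update()/hmac_sha256_final() interface; those names, the context type, and the prepared-key field on ceph_connection are placeholders rather than a committed API.

	/*
	 * Hypothetical sketch, not part of this patch: walk the kvecs with an
	 * incremental HMAC-SHA256 library context instead of a crypto_shash.
	 * The hmac_sha256_* names and the hmac_key field are assumed.
	 */
	static int ceph_hmac_sha256(struct ceph_connection *con,
				    const struct kvec *kvecs, int kvec_cnt,
				    u8 *hmac)
	{
		struct hmac_sha256_ctx ctx;	/* assumed library context type */
		int i;

		hmac_sha256_init(&ctx, &con->v2.hmac_key);	/* assumed field */
		for (i = 0; i < kvec_cnt; i++)
			hmac_sha256_update(&ctx, kvecs[i].iov_base,
					   kvecs[i].iov_len);
		hmac_sha256_final(&ctx, hmac);
		return 0;
	}

With a library call there is no tfm allocation to fail, so the error paths in the callers could eventually be dropped as well.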
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- net/ceph/messenger_v2.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index bd608ffa0627..5483b4eed94e 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -793,8 +793,8 @@ static int setup_crypto(struct ceph_connection *con, return 0; /* auth_x, secure mode */ } -static int hmac_sha256(struct ceph_connection *con, const struct kvec *kvecs, - int kvec_cnt, u8 *hmac) +static int ceph_hmac_sha256(struct ceph_connection *con, + const struct kvec *kvecs, int kvec_cnt, u8 *hmac) { SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */ int ret; @@ -1462,8 +1462,8 @@ static int prepare_auth_signature(struct ceph_connection *con) if (!buf) return -ENOMEM; - ret = hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, - CTRL_BODY(buf)); + ret = ceph_hmac_sha256(con, con->v2.in_sign_kvecs, + con->v2.in_sign_kvec_cnt, CTRL_BODY(buf)); if (ret) return ret; @@ -2460,8 +2460,8 @@ static int process_auth_signature(struct ceph_connection *con, return -EINVAL; } - ret = hmac_sha256(con, con->v2.out_sign_kvecs, - con->v2.out_sign_kvec_cnt, hmac); + ret = ceph_hmac_sha256(con, con->v2.out_sign_kvecs, + con->v2.out_sign_kvec_cnt, hmac); if (ret) return ret; -- cgit v1.2.3 From 85de1929e17e71fafc51473ba1d14101e80180f5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:33 -0700 Subject: cxl/test: Simplify fw_buf_checksum_show() First, just use sha256() instead of a sequence of sha256_init(), sha256_update(), and sha256_final(). The result is the same. Second, use *phN instead of open-coding the conversion of bytes to hex. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- tools/testing/cxl/test/mem.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 0f1d91f57ba3..d533481672b7 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1828,27 +1828,10 @@ static ssize_t fw_buf_checksum_show(struct device *dev, { struct cxl_mockmem_data *mdata = dev_get_drvdata(dev); u8 hash[SHA256_DIGEST_SIZE]; - unsigned char *hstr, *hptr; - struct sha256_state sctx; - ssize_t written = 0; - int i; - - sha256_init(&sctx); - sha256_update(&sctx, mdata->fw, mdata->fw_size); - sha256_final(&sctx, hash); - - hstr = kzalloc((SHA256_DIGEST_SIZE * 2) + 1, GFP_KERNEL); - if (!hstr) - return -ENOMEM; - - hptr = hstr; - for (i = 0; i < SHA256_DIGEST_SIZE; i++) - hptr += sprintf(hptr, "%02x", hash[i]); - written = sysfs_emit(buf, "%s\n", hstr); + sha256(mdata->fw, mdata->fw_size, hash); - kfree(hstr); - return written; + return sysfs_emit(buf, "%*phN\n", SHA256_DIGEST_SIZE, hash); } static DEVICE_ATTR_RO(fw_buf_checksum); -- cgit v1.2.3 From 3135d5be7c27841526d98150c245304ab312e9f4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:34 -0700 Subject: lib/crypto: sha256: Reorder some code First, move the declarations of sha224_init/update/final to be just above the corresponding SHA-256 code, matching the order that I used for SHA-384 and SHA-512. In sha2.h, the end result is that SHA-224, SHA-256, SHA-384, and SHA-512 are all in the logical order. Second, move sha224_block_init() and sha256_block_init() to be just below crypto_sha256_state. 
In later changes, these functions as well as struct crypto_sha256_state will no longer be used by the library functions. They'll remain just for some legacy offload drivers. This gets them into a logical place in the file for that. No code changes other than reordering. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha2.h | 60 +++++++++++++++++++++++++-------------------------- lib/crypto/sha256.c | 12 +++++------ 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index 296ce9d468bf..bb181b7996cd 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -71,6 +71,32 @@ struct crypto_sha256_state { u64 count; }; +static inline void sha224_block_init(struct crypto_sha256_state *sctx) +{ + sctx->state[0] = SHA224_H0; + sctx->state[1] = SHA224_H1; + sctx->state[2] = SHA224_H2; + sctx->state[3] = SHA224_H3; + sctx->state[4] = SHA224_H4; + sctx->state[5] = SHA224_H5; + sctx->state[6] = SHA224_H6; + sctx->state[7] = SHA224_H7; + sctx->count = 0; +} + +static inline void sha256_block_init(struct crypto_sha256_state *sctx) +{ + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; +} + struct sha256_state { union { struct crypto_sha256_state ctx; @@ -88,18 +114,12 @@ struct sha512_state { u8 buf[SHA512_BLOCK_SIZE]; }; -static inline void sha256_block_init(struct crypto_sha256_state *sctx) +static inline void sha224_init(struct sha256_state *sctx) { - sctx->state[0] = SHA256_H0; - sctx->state[1] = SHA256_H1; - sctx->state[2] = SHA256_H2; - sctx->state[3] = SHA256_H3; - sctx->state[4] = SHA256_H4; - sctx->state[5] = SHA256_H5; - sctx->state[6] = SHA256_H6; - sctx->state[7] = SHA256_H7; - sctx->count = 0; + sha224_block_init(&sctx->ctx); } +/* Simply use sha256_update as it is equivalent to sha224_update. */ +void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]); static inline void sha256_init(struct sha256_state *sctx) { @@ -109,26 +129,6 @@ void sha256_update(struct sha256_state *sctx, const u8 *data, size_t len); void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE]); void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); -static inline void sha224_block_init(struct crypto_sha256_state *sctx) -{ - sctx->state[0] = SHA224_H0; - sctx->state[1] = SHA224_H1; - sctx->state[2] = SHA224_H2; - sctx->state[3] = SHA224_H3; - sctx->state[4] = SHA224_H4; - sctx->state[5] = SHA224_H5; - sctx->state[6] = SHA224_H6; - sctx->state[7] = SHA224_H7; - sctx->count = 0; -} - -static inline void sha224_init(struct sha256_state *sctx) -{ - sha224_block_init(&sctx->ctx); -} -/* Simply use sha256_update as it is equivalent to sha224_update. 
*/ -void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]); - /* State for the SHA-512 (and SHA-384) compression function */ struct sha512_block_state { u64 h[8]; diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 6bfa4ae8dfb5..573ccecbf48b 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -58,18 +58,18 @@ static inline void __sha256_final(struct sha256_state *sctx, u8 *out, memzero_explicit(sctx, sizeof(*sctx)); } -void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE]) -{ - __sha256_final(sctx, out, SHA256_DIGEST_SIZE); -} -EXPORT_SYMBOL(sha256_final); - void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]) { __sha256_final(sctx, out, SHA224_DIGEST_SIZE); } EXPORT_SYMBOL(sha224_final); +void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE]) +{ + __sha256_final(sctx, out, SHA256_DIGEST_SIZE); +} +EXPORT_SYMBOL(sha256_final); + void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) { struct sha256_state sctx; -- cgit v1.2.3 From 9f97707bdb1e479ea15e14e5525164f5f1128e97 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:35 -0700 Subject: lib/crypto: sha256: Remove sha256_blocks_simd() Instead of having both sha256_blocks_arch() and sha256_blocks_simd(), instead have just sha256_blocks_arch() which uses the most efficient implementation that is available in the calling context. This is simpler, as it reduces the API surface. It's also safer, since sha256_blocks_arch() just works in all contexts, including contexts where the FPU/SIMD/vector registers cannot be used. This doesn't mean that SHA-256 computations *should* be done in such contexts, but rather we should just do the right thing instead of corrupting a random task's registers. Eliminating this footgun and simplifying the code is well worth the very small performance cost of doing the check. Note: in the case of arm and arm64, what used to be sha256_blocks_arch() is renamed back to its original name of sha256_block_data_order(). sha256_blocks_arch() is now used for the higher-level dispatch function. This renaming also required an update to lib/crypto/arm64/sha512.h, since sha2-armv8.pl is shared by both SHA-256 and SHA-512. 
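As a rough sketch (not part of the diffs below), this is the dispatch pattern the commit folds into sha256_blocks_arch() itself. The static key, the raw block functions, and the SIMD begin/end helpers are placeholders whose real names differ per architecture; only crypto_simd_usable() and the function signature are taken from the patch:

#include <crypto/internal/sha2.h>
#include <crypto/internal/simd.h>

void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
			const u8 *data, size_t nblocks)
{
	/* have_simd, kernel_simd_begin/end(), and the two block
	 * functions are stand-ins for the per-arch equivalents.
	 */
	if (static_branch_likely(&have_simd) && crypto_simd_usable()) {
		kernel_simd_begin();
		sha256_blocks_asm_simd(state, data, nblocks);
		kernel_simd_end();
	} else {
		/* Safe in any context, e.g. hardirq. */
		sha256_blocks_scalar(state, data, nblocks);
	}
}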
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/internal/sha2.h | 6 ------ lib/crypto/Kconfig | 8 -------- lib/crypto/arm/Kconfig | 1 - lib/crypto/arm/sha256-armv4.pl | 20 ++++++++++---------- lib/crypto/arm/sha256.c | 14 +++++++------- lib/crypto/arm64/Kconfig | 1 - lib/crypto/arm64/sha2-armv8.pl | 2 +- lib/crypto/arm64/sha256.c | 14 +++++++------- lib/crypto/arm64/sha512.h | 6 +++--- lib/crypto/riscv/Kconfig | 1 - lib/crypto/riscv/sha256.c | 12 +++--------- lib/crypto/x86/Kconfig | 1 - lib/crypto/x86/sha256.c | 12 +++--------- 13 files changed, 34 insertions(+), 64 deletions(-) diff --git a/include/crypto/internal/sha2.h b/include/crypto/internal/sha2.h index 21a27fd5e198..5a25ccc49388 100644 --- a/include/crypto/internal/sha2.h +++ b/include/crypto/internal/sha2.h @@ -3,7 +3,6 @@ #ifndef _CRYPTO_INTERNAL_SHA2_H #define _CRYPTO_INTERNAL_SHA2_H -#include #include #include #include @@ -22,8 +21,6 @@ void sha256_blocks_generic(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); -void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); static __always_inline void sha256_choose_blocks( u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks, @@ -31,9 +28,6 @@ static __always_inline void sha256_choose_blocks( { if (!IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_SHA256) || force_generic) sha256_blocks_generic(state, data, nblocks); - else if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD) && - (force_simd || crypto_simd_usable())) - sha256_blocks_simd(state, data, nblocks); else sha256_blocks_arch(state, data, nblocks); } diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 2460ddff967f..9bd740475a89 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -150,14 +150,6 @@ config CRYPTO_ARCH_HAVE_LIB_SHA256 Declares whether the architecture provides an arch-specific accelerated implementation of the SHA-256 library interface. -config CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD - bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the SHA-256 library interface - that is SIMD-based and therefore not usable in hardirq - context. 
- config CRYPTO_LIB_SHA256_GENERIC tristate default CRYPTO_LIB_SHA256 if !CRYPTO_ARCH_HAVE_LIB_SHA256 diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig index d1ad664f0c67..9f3ff30f4032 100644 --- a/lib/crypto/arm/Kconfig +++ b/lib/crypto/arm/Kconfig @@ -28,4 +28,3 @@ config CRYPTO_SHA256_ARM depends on !CPU_V7M default CRYPTO_LIB_SHA256 select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD diff --git a/lib/crypto/arm/sha256-armv4.pl b/lib/crypto/arm/sha256-armv4.pl index 8122db7fd599..f3a2b54efd4e 100644 --- a/lib/crypto/arm/sha256-armv4.pl +++ b/lib/crypto/arm/sha256-armv4.pl @@ -204,18 +204,18 @@ K256: .word 0 @ terminator #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha256_blocks_arch +.word OPENSSL_armcap_P-sha256_block_data_order #endif .align 5 -.global sha256_blocks_arch -.type sha256_blocks_arch,%function -sha256_blocks_arch: -.Lsha256_blocks_arch: +.global sha256_block_data_order +.type sha256_block_data_order,%function +sha256_block_data_order: +.Lsha256_block_data_order: #if __ARM_ARCH__<7 - sub r3,pc,#8 @ sha256_blocks_arch + sub r3,pc,#8 @ sha256_block_data_order #else - adr r3,.Lsha256_blocks_arch + adr r3,.Lsha256_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap @@ -282,7 +282,7 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size sha256_blocks_arch,.-sha256_blocks_arch +.size sha256_block_data_order,.-sha256_block_data_order ___ ###################################################################### # NEON stuff @@ -470,8 +470,8 @@ sha256_block_data_order_neon: stmdb sp!,{r4-r12,lr} sub $H,sp,#16*4+16 - adr $Ktbl,.Lsha256_blocks_arch - sub $Ktbl,$Ktbl,#.Lsha256_blocks_arch-K256 + adr $Ktbl,.Lsha256_block_data_order + sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256 bic $H,$H,#15 @ align for 128-bit stores mov $t2,sp mov sp,$H @ alloca diff --git a/lib/crypto/arm/sha256.c b/lib/crypto/arm/sha256.c index 109192e54b0f..2c9cfdaaa069 100644 --- a/lib/crypto/arm/sha256.c +++ b/lib/crypto/arm/sha256.c @@ -6,12 +6,12 @@ */ #include #include +#include #include #include -asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -EXPORT_SYMBOL_GPL(sha256_blocks_arch); +asmlinkage void sha256_block_data_order(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS], @@ -20,11 +20,11 @@ asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS], static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); -void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon)) { + static_branch_likely(&have_neon) && crypto_simd_usable()) { kernel_neon_begin(); if (static_branch_likely(&have_ce)) sha256_ce_transform(state, data, nblocks); @@ -32,10 +32,10 @@ void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], sha256_block_data_order_neon(state, data, nblocks); kernel_neon_end(); } else { - sha256_blocks_arch(state, data, nblocks); + sha256_block_data_order(state, data, nblocks); } } -EXPORT_SYMBOL_GPL(sha256_blocks_simd); +EXPORT_SYMBOL_GPL(sha256_blocks_arch); 
bool sha256_is_arch_optimized(void) { diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig index 129a7685cb4c..49e57bfdb5b5 100644 --- a/lib/crypto/arm64/Kconfig +++ b/lib/crypto/arm64/Kconfig @@ -17,4 +17,3 @@ config CRYPTO_SHA256_ARM64 tristate default CRYPTO_LIB_SHA256 select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD diff --git a/lib/crypto/arm64/sha2-armv8.pl b/lib/crypto/arm64/sha2-armv8.pl index 4aebd20c498b..35ec9ae99fe1 100644 --- a/lib/crypto/arm64/sha2-armv8.pl +++ b/lib/crypto/arm64/sha2-armv8.pl @@ -95,7 +95,7 @@ if ($output =~ /512/) { $reg_t="w"; } -$func="sha${BITS}_blocks_arch"; +$func="sha${BITS}_block_data_order"; ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); diff --git a/lib/crypto/arm64/sha256.c b/lib/crypto/arm64/sha256.c index bcf7a3adc0c4..fb9bff40357b 100644 --- a/lib/crypto/arm64/sha256.c +++ b/lib/crypto/arm64/sha256.c @@ -6,12 +6,12 @@ */ #include #include +#include #include #include -asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -EXPORT_SYMBOL_GPL(sha256_blocks_arch); +asmlinkage void sha256_block_data_order(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS], @@ -20,11 +20,11 @@ asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS], static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); -void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon)) { + static_branch_likely(&have_neon) && crypto_simd_usable()) { if (static_branch_likely(&have_ce)) { do { size_t rem; @@ -42,10 +42,10 @@ void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], kernel_neon_end(); } } else { - sha256_blocks_arch(state, data, nblocks); + sha256_block_data_order(state, data, nblocks); } } -EXPORT_SYMBOL_GPL(sha256_blocks_simd); +EXPORT_SYMBOL_GPL(sha256_blocks_arch); bool sha256_is_arch_optimized(void) { diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h index eae14f9752e0..6abb40b467f2 100644 --- a/lib/crypto/arm64/sha512.h +++ b/lib/crypto/arm64/sha512.h @@ -11,8 +11,8 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_insns); -asmlinkage void sha512_blocks_arch(struct sha512_block_state *state, - const u8 *data, size_t nblocks); +asmlinkage void sha512_block_data_order(struct sha512_block_state *state, + const u8 *data, size_t nblocks); asmlinkage size_t __sha512_ce_transform(struct sha512_block_state *state, const u8 *data, size_t nblocks); @@ -32,7 +32,7 @@ static void sha512_blocks(struct sha512_block_state *state, nblocks = rem; } while (nblocks); } else { - sha512_blocks_arch(state, data, nblocks); + sha512_block_data_order(state, data, nblocks); } } diff --git a/lib/crypto/riscv/Kconfig b/lib/crypto/riscv/Kconfig index 47c99ea97ce2..c100571feb7e 100644 --- a/lib/crypto/riscv/Kconfig +++ b/lib/crypto/riscv/Kconfig @@ -12,5 +12,4 @@ config CRYPTO_SHA256_RISCV64 depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO default CRYPTO_LIB_SHA256 select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD select CRYPTO_LIB_SHA256_GENERIC diff --git a/lib/crypto/riscv/sha256.c b/lib/crypto/riscv/sha256.c index 
71808397dff4..aa77349d08f3 100644 --- a/lib/crypto/riscv/sha256.c +++ b/lib/crypto/riscv/sha256.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -19,10 +20,10 @@ asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb( static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); -void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks) { - if (static_branch_likely(&have_extensions)) { + if (static_branch_likely(&have_extensions) && crypto_simd_usable()) { kernel_vector_begin(); sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks); kernel_vector_end(); @@ -30,13 +31,6 @@ void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], sha256_blocks_generic(state, data, nblocks); } } -EXPORT_SYMBOL_GPL(sha256_blocks_simd); - -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - sha256_blocks_generic(state, data, nblocks); -} EXPORT_SYMBOL_GPL(sha256_blocks_arch); bool sha256_is_arch_optimized(void) diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig index 5e94cdee492c..e344579db3d8 100644 --- a/lib/crypto/x86/Kconfig +++ b/lib/crypto/x86/Kconfig @@ -30,5 +30,4 @@ config CRYPTO_SHA256_X86_64 depends on 64BIT default CRYPTO_LIB_SHA256 select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD select CRYPTO_LIB_SHA256_GENERIC diff --git a/lib/crypto/x86/sha256.c b/lib/crypto/x86/sha256.c index 80380f8fdcee..baba74d7d26f 100644 --- a/lib/crypto/x86/sha256.c +++ b/lib/crypto/x86/sha256.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include @@ -23,10 +24,10 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86); DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3); -void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks) { - if (static_branch_likely(&have_sha256_x86)) { + if (static_branch_likely(&have_sha256_x86) && crypto_simd_usable()) { kernel_fpu_begin(); static_call(sha256_blocks_x86)(state, data, nblocks); kernel_fpu_end(); @@ -34,13 +35,6 @@ void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], sha256_blocks_generic(state, data, nblocks); } } -EXPORT_SYMBOL_GPL(sha256_blocks_simd); - -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - sha256_blocks_generic(state, data, nblocks); -} EXPORT_SYMBOL_GPL(sha256_blocks_arch); bool sha256_is_arch_optimized(void) -- cgit v1.2.3 From 6fa4b292204b15e0e269a9fd33bc99b5e36b6883 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:36 -0700 Subject: lib/crypto: sha256: Add sha224() and sha224_update() Add a one-shot SHA-224 computation function sha224(), for consistency with sha256(), sha384(), and sha512() which all already exist. Similarly, add sha224_update(). While for now it's identical to sha256_update(), omitting it makes the API harder to use since users have to "know" which functions are the same between SHA-224 and SHA-256. Also, this is a prerequisite for using different context types for each. 
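As a rough usage sketch (not part of the diff below), the new helpers slot in next to the existing SHA-256 ones; demo_sha224() is a hypothetical caller, and at this point in the series the context is still struct sha256_state:

#include <crypto/sha2.h>

static void demo_sha224(const u8 *msg, size_t len,
			u8 digest[SHA224_DIGEST_SIZE])
{
	struct sha256_state sctx;

	/* One-shot helper added by this commit. */
	sha224(msg, len, digest);

	/* Equivalent incremental computation; sha224_update() simply
	 * wraps sha256_update().
	 */
	sha224_init(&sctx);
	sha224_update(&sctx, msg, len);
	sha224_final(&sctx, digest);
}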
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha2.h | 10 ++++++++-- lib/crypto/sha256.c | 10 ++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index bb181b7996cd..e31da0743a52 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -114,18 +114,24 @@ struct sha512_state { u8 buf[SHA512_BLOCK_SIZE]; }; +void sha256_update(struct sha256_state *sctx, const u8 *data, size_t len); + static inline void sha224_init(struct sha256_state *sctx) { sha224_block_init(&sctx->ctx); } -/* Simply use sha256_update as it is equivalent to sha224_update. */ +static inline void sha224_update(struct sha256_state *sctx, + const u8 *data, size_t len) +{ + sha256_update(sctx, data, len); +} void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]); +void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]); static inline void sha256_init(struct sha256_state *sctx) { sha256_block_init(&sctx->ctx); } -void sha256_update(struct sha256_state *sctx, const u8 *data, size_t len); void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE]); void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 573ccecbf48b..ccaae7088016 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -70,6 +70,16 @@ void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE]) } EXPORT_SYMBOL(sha256_final); +void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]) +{ + struct sha256_state sctx; + + sha224_init(&sctx); + sha224_update(&sctx, data, len); + sha224_final(&sctx, out); +} +EXPORT_SYMBOL(sha224); + void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) { struct sha256_state sctx; -- cgit v1.2.3 From b86ced882b8e667758afddffd8d6354197842110 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:37 -0700 Subject: lib/crypto: sha256: Make library API use strongly-typed contexts Currently the SHA-224 and SHA-256 library functions can be mixed arbitrarily, even in ways that are incorrect, for example using sha224_init() and sha256_final(). This is because they operate on the same structure, sha256_state. Introduce stronger typing, as I did for SHA-384 and SHA-512. Also as I did for SHA-384 and SHA-512, use the names *_ctx instead of *_state. The *_ctx names have the following small benefits: - They're shorter. - They avoid an ambiguity with the compression function state. - They're consistent with the well-known OpenSSL API. - Users usually name the variable 'sctx' anyway, which suggests that *_ctx would be the more natural name for the actual struct. Therefore: update the SHA-224 and SHA-256 APIs, implementation, and calling code accordingly. In the new structs, also strongly-type the compression function state. 
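As a rough usage sketch (not part of the diff below), demo_sha256_typed() is a hypothetical caller of the retyped API; because struct sha224_ctx and struct sha256_ctx are now distinct wrappers, pairing sha224_init() with sha256_final() fails to type-check rather than silently computing an incorrect digest:

#include <crypto/sha2.h>

static void demo_sha256_typed(const u8 *msg, size_t len,
			      u8 digest[SHA256_DIGEST_SIZE])
{
	struct sha256_ctx ctx;	/* formerly struct sha256_state */

	sha256_init(&ctx);
	sha256_update(&ctx, msg, len);
	sha256_final(&ctx, digest);
	/* sha224_final(&ctx, ...) would now be rejected at compile time. */
}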
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/riscv/purgatory/purgatory.c | 8 ++-- arch/s390/purgatory/purgatory.c | 2 +- arch/x86/purgatory/purgatory.c | 2 +- crypto/sha256.c | 16 +++---- drivers/char/tpm/tpm2-sessions.c | 12 ++--- include/crypto/sha2.h | 52 +++++++++++++++----- kernel/kexec_file.c | 10 ++-- lib/crypto/sha256.c | 100 ++++++++++++++++++++++++++++----------- 8 files changed, 139 insertions(+), 63 deletions(-) diff --git a/arch/riscv/purgatory/purgatory.c b/arch/riscv/purgatory/purgatory.c index 80596ab5fb62..bbd5cfa4d741 100644 --- a/arch/riscv/purgatory/purgatory.c +++ b/arch/riscv/purgatory/purgatory.c @@ -20,14 +20,14 @@ struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(".kex static int verify_sha256_digest(void) { struct kexec_sha_region *ptr, *end; - struct sha256_state ss; + struct sha256_ctx sctx; u8 digest[SHA256_DIGEST_SIZE]; - sha256_init(&ss); + sha256_init(&sctx); end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions); for (ptr = purgatory_sha_regions; ptr < end; ptr++) - sha256_update(&ss, (uint8_t *)(ptr->start), ptr->len); - sha256_final(&ss, digest); + sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len); + sha256_final(&sctx, digest); if (memcmp(digest, purgatory_sha256_digest, sizeof(digest)) != 0) return 1; return 0; diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c index 030efda05dbe..ecb38102187c 100644 --- a/arch/s390/purgatory/purgatory.c +++ b/arch/s390/purgatory/purgatory.c @@ -16,7 +16,7 @@ int verify_sha256_digest(void) { struct kexec_sha_region *ptr, *end; u8 digest[SHA256_DIGEST_SIZE]; - struct sha256_state sctx; + struct sha256_ctx sctx; sha256_init(&sctx); end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions); diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index aea47e793963..655139dd0532 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -25,7 +25,7 @@ static int verify_sha256_digest(void) { struct kexec_sha_region *ptr, *end; u8 digest[SHA256_DIGEST_SIZE]; - struct sha256_state sctx; + struct sha256_ctx sctx; sha256_init(&sctx); end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions); diff --git a/crypto/sha256.c b/crypto/sha256.c index 4aeb213bab11..15c57fba256b 100644 --- a/crypto/sha256.c +++ b/crypto/sha256.c @@ -137,24 +137,24 @@ static int crypto_sha224_final_lib(struct shash_desc *desc, u8 *out) static int crypto_sha256_import_lib(struct shash_desc *desc, const void *in) { - struct sha256_state *sctx = shash_desc_ctx(desc); + struct __sha256_ctx *sctx = shash_desc_ctx(desc); const u8 *p = in; memcpy(sctx, p, sizeof(*sctx)); p += sizeof(*sctx); - sctx->count += *p; + sctx->bytecount += *p; return 0; } static int crypto_sha256_export_lib(struct shash_desc *desc, void *out) { - struct sha256_state *sctx0 = shash_desc_ctx(desc); - struct sha256_state sctx = *sctx0; + struct __sha256_ctx *sctx0 = shash_desc_ctx(desc); + struct __sha256_ctx sctx = *sctx0; unsigned int partial; u8 *p = out; - partial = sctx.count % SHA256_BLOCK_SIZE; - sctx.count -= partial; + partial = sctx.bytecount % SHA256_BLOCK_SIZE; + sctx.bytecount -= partial; memcpy(p, &sctx, sizeof(sctx)); p += sizeof(sctx); *p = partial; @@ -201,7 +201,7 @@ static struct shash_alg algs[] = { .update = crypto_sha256_update_lib, .final = crypto_sha256_final_lib, .digest = crypto_sha256_digest_lib, - .descsize = sizeof(struct sha256_state), + .descsize 
= sizeof(struct sha256_ctx), .statesize = sizeof(struct crypto_sha256_state) + SHA256_BLOCK_SIZE + 1, .import = crypto_sha256_import_lib, @@ -216,7 +216,7 @@ static struct shash_alg algs[] = { .init = crypto_sha224_init, .update = crypto_sha256_update_lib, .final = crypto_sha224_final_lib, - .descsize = sizeof(struct sha256_state), + .descsize = sizeof(struct sha224_ctx), .statesize = sizeof(struct crypto_sha256_state) + SHA256_BLOCK_SIZE + 1, .import = crypto_sha256_import_lib, diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c index 7b5049b3d476..bdb119453dfb 100644 --- a/drivers/char/tpm/tpm2-sessions.c +++ b/drivers/char/tpm/tpm2-sessions.c @@ -390,7 +390,7 @@ static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy, * on every operation, so we weld the hmac init and final functions in * here to give it the same usage characteristics as a regular hash */ -static void tpm2_hmac_init(struct sha256_state *sctx, u8 *key, u32 key_len) +static void tpm2_hmac_init(struct sha256_ctx *sctx, u8 *key, u32 key_len) { u8 pad[SHA256_BLOCK_SIZE]; int i; @@ -406,7 +406,7 @@ static void tpm2_hmac_init(struct sha256_state *sctx, u8 *key, u32 key_len) sha256_update(sctx, pad, sizeof(pad)); } -static void tpm2_hmac_final(struct sha256_state *sctx, u8 *key, u32 key_len, +static void tpm2_hmac_final(struct sha256_ctx *sctx, u8 *key, u32 key_len, u8 *out) { u8 pad[SHA256_BLOCK_SIZE]; @@ -440,7 +440,7 @@ static void tpm2_KDFa(u8 *key, u32 key_len, const char *label, u8 *u, const __be32 bits = cpu_to_be32(bytes * 8); while (bytes > 0) { - struct sha256_state sctx; + struct sha256_ctx sctx; __be32 c = cpu_to_be32(counter); tpm2_hmac_init(&sctx, key, key_len); @@ -467,7 +467,7 @@ static void tpm2_KDFa(u8 *key, u32 key_len, const char *label, u8 *u, static void tpm2_KDFe(u8 z[EC_PT_SZ], const char *str, u8 *pt_u, u8 *pt_v, u8 *out) { - struct sha256_state sctx; + struct sha256_ctx sctx; /* * this should be an iterative counter, but because we know * we're only taking 32 bytes for the point using a sha256 @@ -592,7 +592,7 @@ void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf) u8 *hmac = NULL; u32 attrs; u8 cphash[SHA256_DIGEST_SIZE]; - struct sha256_state sctx; + struct sha256_ctx sctx; if (!auth) return; @@ -750,7 +750,7 @@ int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf, off_t offset_s, offset_p; u8 rphash[SHA256_DIGEST_SIZE]; u32 attrs, cc; - struct sha256_state sctx; + struct sha256_ctx sctx; u16 tag = be16_to_cpu(head->tag); int parm_len, len, i, handles; diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index e31da0743a52..18e1eec841b7 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -114,25 +114,55 @@ struct sha512_state { u8 buf[SHA512_BLOCK_SIZE]; }; -void sha256_update(struct sha256_state *sctx, const u8 *data, size_t len); +/* State for the SHA-256 (and SHA-224) compression function */ +struct sha256_block_state { + u32 h[SHA256_STATE_WORDS]; +}; -static inline void sha224_init(struct sha256_state *sctx) -{ - sha224_block_init(&sctx->ctx); -} -static inline void sha224_update(struct sha256_state *sctx, +/* + * Context structure, shared by SHA-224 and SHA-256. The sha224_ctx and + * sha256_ctx structs wrap this one so that the API has proper typing and + * doesn't allow mixing the SHA-224 and SHA-256 functions arbitrarily. 
+ */ +struct __sha256_ctx { + struct sha256_block_state state; + u64 bytecount; + u8 buf[SHA256_BLOCK_SIZE] __aligned(__alignof__(__be64)); +}; +void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len); + +/** + * struct sha224_ctx - Context for hashing a message with SHA-224 + * @ctx: private + */ +struct sha224_ctx { + struct __sha256_ctx ctx; +}; + +void sha224_init(struct sha224_ctx *ctx); +static inline void sha224_update(struct sha224_ctx *ctx, const u8 *data, size_t len) { - sha256_update(sctx, data, len); + __sha256_update(&ctx->ctx, data, len); } -void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]); +void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]); void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]); -static inline void sha256_init(struct sha256_state *sctx) +/** + * struct sha256_ctx - Context for hashing a message with SHA-256 + * @ctx: private + */ +struct sha256_ctx { + struct __sha256_ctx ctx; +}; + +void sha256_init(struct sha256_ctx *ctx); +static inline void sha256_update(struct sha256_ctx *ctx, + const u8 *data, size_t len) { - sha256_block_init(&sctx->ctx); + __sha256_update(&ctx->ctx, data, len); } -void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE]); +void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); /* State for the SHA-512 (and SHA-384) compression function */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 69fe76fd9233..b835033c65eb 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -751,7 +751,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Calculate and store the digest of segments */ static int kexec_calculate_store_digests(struct kimage *image) { - struct sha256_state state; + struct sha256_ctx sctx; int ret = 0, i, j, zero_buf_sz, sha_region_sz; size_t nullsz; u8 digest[SHA256_DIGEST_SIZE]; @@ -770,7 +770,7 @@ static int kexec_calculate_store_digests(struct kimage *image) if (!sha_regions) return -ENOMEM; - sha256_init(&state); + sha256_init(&sctx); for (j = i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; @@ -796,7 +796,7 @@ static int kexec_calculate_store_digests(struct kimage *image) if (check_ima_segment_index(image, i)) continue; - sha256_update(&state, ksegment->kbuf, ksegment->bufsz); + sha256_update(&sctx, ksegment->kbuf, ksegment->bufsz); /* * Assume rest of the buffer is filled with zero and @@ -808,7 +808,7 @@ static int kexec_calculate_store_digests(struct kimage *image) if (bytes > zero_buf_sz) bytes = zero_buf_sz; - sha256_update(&state, zero_buf, bytes); + sha256_update(&sctx, zero_buf, bytes); nullsz -= bytes; } @@ -817,7 +817,7 @@ static int kexec_calculate_store_digests(struct kimage *image) j++; } - sha256_final(&state, digest); + sha256_final(&sctx, digest); ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", sha_regions, sha_region_sz, 0); diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index ccaae7088016..3e7797a4489d 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -18,6 +18,20 @@ #include #include +static const struct sha256_block_state sha224_iv = { + .h = { + SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, + SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, + }, +}; + +static const struct sha256_block_state sha256_iv = { + .h = { + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, + }, +}; + /* * If __DISABLE_EXPORTS is defined, then 
this file is being compiled for a * pre-boot environment. In that case, ignore the kconfig options, pull the @@ -32,61 +46,93 @@ static inline bool sha256_purgatory(void) return __is_defined(__DISABLE_EXPORTS); } -static inline void sha256_blocks(u32 state[SHA256_STATE_WORDS], const u8 *data, - size_t nblocks) +static inline void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + sha256_choose_blocks(state->h, data, nblocks, sha256_purgatory(), false); +} + +static void __sha256_init(struct __sha256_ctx *ctx, + const struct sha256_block_state *iv, + u64 initial_bytecount) +{ + ctx->state = *iv; + ctx->bytecount = initial_bytecount; +} + +void sha224_init(struct sha224_ctx *ctx) +{ + __sha256_init(&ctx->ctx, &sha224_iv, 0); +} +EXPORT_SYMBOL_GPL(sha224_init); + +void sha256_init(struct sha256_ctx *ctx) { - sha256_choose_blocks(state, data, nblocks, sha256_purgatory(), false); + __sha256_init(&ctx->ctx, &sha256_iv, 0); } +EXPORT_SYMBOL_GPL(sha256_init); -void sha256_update(struct sha256_state *sctx, const u8 *data, size_t len) +void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len) { - size_t partial = sctx->count % SHA256_BLOCK_SIZE; + size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE; - sctx->count += len; - BLOCK_HASH_UPDATE_BLOCKS(sha256_blocks, sctx->ctx.state, data, len, - SHA256_BLOCK_SIZE, sctx->buf, partial); + ctx->bytecount += len; + BLOCK_HASH_UPDATE_BLOCKS(sha256_blocks, &ctx->state, data, len, + SHA256_BLOCK_SIZE, ctx->buf, partial); } -EXPORT_SYMBOL(sha256_update); +EXPORT_SYMBOL(__sha256_update); -static inline void __sha256_final(struct sha256_state *sctx, u8 *out, - size_t digest_size) +static void __sha256_final(struct __sha256_ctx *ctx, + u8 *out, size_t digest_size) { - size_t partial = sctx->count % SHA256_BLOCK_SIZE; + u64 bitcount = ctx->bytecount << 3; + size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE; + + ctx->buf[partial++] = 0x80; + if (partial > SHA256_BLOCK_SIZE - 8) { + memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - partial); + sha256_blocks(&ctx->state, ctx->buf, 1); + partial = 0; + } + memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - 8 - partial); + *(__be64 *)&ctx->buf[SHA256_BLOCK_SIZE - 8] = cpu_to_be64(bitcount); + sha256_blocks(&ctx->state, ctx->buf, 1); - sha256_finup(&sctx->ctx, sctx->buf, partial, out, digest_size, - sha256_purgatory(), false); - memzero_explicit(sctx, sizeof(*sctx)); + for (size_t i = 0; i < digest_size; i += 4) + put_unaligned_be32(ctx->state.h[i / 4], out + i); } -void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]) +void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]) { - __sha256_final(sctx, out, SHA224_DIGEST_SIZE); + __sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); } EXPORT_SYMBOL(sha224_final); -void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE]) +void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]) { - __sha256_final(sctx, out, SHA256_DIGEST_SIZE); + __sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); } EXPORT_SYMBOL(sha256_final); void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]) { - struct sha256_state sctx; + struct sha224_ctx ctx; - sha224_init(&sctx); - sha224_update(&sctx, data, len); - sha224_final(&sctx, out); + sha224_init(&ctx); + sha224_update(&ctx, data, len); + sha224_final(&ctx, out); } EXPORT_SYMBOL(sha224); void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) 
{ - struct sha256_state sctx; + struct sha256_ctx ctx; - sha256_init(&sctx); - sha256_update(&sctx, data, len); - sha256_final(&sctx, out); + sha256_init(&ctx); + sha256_update(&ctx, data, len); + sha256_final(&ctx, out); } EXPORT_SYMBOL(sha256); -- cgit v1.2.3 From 4c855d5069ee2edbcf62fafc7f1a5d4cfea1bce1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:38 -0700 Subject: lib/crypto: sha256: Propagate sha256_block_state type to implementations The previous commit made the SHA-256 compression function state be strongly typed, but it wasn't propagated all the way down to the implementations of it. Do that now. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/mips/cavium-octeon/crypto/octeon-sha256.c | 2 +- include/crypto/internal/sha2.h | 8 ++++---- lib/crypto/arm/sha256-ce.S | 2 +- lib/crypto/arm/sha256.c | 8 ++++---- lib/crypto/arm64/sha256-ce.S | 2 +- lib/crypto/arm64/sha256.c | 8 ++++---- lib/crypto/powerpc/sha256.c | 5 +++-- .../riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S | 2 +- lib/crypto/riscv/sha256.c | 7 ++++--- lib/crypto/s390/sha256.c | 2 +- lib/crypto/sha256-generic.c | 24 ++++++++++++++++------ lib/crypto/sparc/sha256.c | 4 ++-- lib/crypto/x86/sha256-avx-asm.S | 2 +- lib/crypto/x86/sha256-avx2-asm.S | 2 +- lib/crypto/x86/sha256-ni-asm.S | 2 +- lib/crypto/x86/sha256-ssse3-asm.S | 2 +- lib/crypto/x86/sha256.c | 10 ++++----- 17 files changed, 53 insertions(+), 39 deletions(-) diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha256.c b/arch/mips/cavium-octeon/crypto/octeon-sha256.c index c20038239cb6..f8664818d04e 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-sha256.c +++ b/arch/mips/cavium-octeon/crypto/octeon-sha256.c @@ -22,7 +22,7 @@ * We pass everything as 64-bit. OCTEON can handle misaligned data. 
*/ -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(struct sha256_block_state *state, const u8 *data, size_t nblocks) { struct octeon_cop2_state cop2_state; diff --git a/include/crypto/internal/sha2.h b/include/crypto/internal/sha2.h index 5a25ccc49388..f0f455477bbd 100644 --- a/include/crypto/internal/sha2.h +++ b/include/crypto/internal/sha2.h @@ -17,9 +17,9 @@ static inline bool sha256_is_arch_optimized(void) return false; } #endif -void sha256_blocks_generic(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_generic(struct sha256_block_state *state, const u8 *data, size_t nblocks); -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(struct sha256_block_state *state, const u8 *data, size_t nblocks); static __always_inline void sha256_choose_blocks( @@ -27,9 +27,9 @@ static __always_inline void sha256_choose_blocks( bool force_generic, bool force_simd) { if (!IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_SHA256) || force_generic) - sha256_blocks_generic(state, data, nblocks); + sha256_blocks_generic((struct sha256_block_state *)state, data, nblocks); else - sha256_blocks_arch(state, data, nblocks); + sha256_blocks_arch((struct sha256_block_state *)state, data, nblocks); } static __always_inline void sha256_finup( diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S index ac2c9b01b22d..7481ac8e6c0d 100644 --- a/lib/crypto/arm/sha256-ce.S +++ b/lib/crypto/arm/sha256-ce.S @@ -67,7 +67,7 @@ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 /* - * void sha256_ce_transform(u32 state[SHA256_STATE_WORDS], + * void sha256_ce_transform(struct sha256_block_state *state, * const u8 *data, size_t nblocks); */ ENTRY(sha256_ce_transform) diff --git a/lib/crypto/arm/sha256.c b/lib/crypto/arm/sha256.c index 2c9cfdaaa069..7d9082358695 100644 --- a/lib/crypto/arm/sha256.c +++ b/lib/crypto/arm/sha256.c @@ -10,17 +10,17 @@ #include #include -asmlinkage void sha256_block_data_order(u32 state[SHA256_STATE_WORDS], +asmlinkage void sha256_block_data_order(struct sha256_block_state *state, const u8 *data, size_t nblocks); -asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS], +asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state, const u8 *data, size_t nblocks); -asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS], +asmlinkage void sha256_ce_transform(struct sha256_block_state *state, const u8 *data, size_t nblocks); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(struct sha256_block_state *state, const u8 *data, size_t nblocks) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S index f3e21c6d87d2..b99d9589c421 100644 --- a/lib/crypto/arm64/sha256-ce.S +++ b/lib/crypto/arm64/sha256-ce.S @@ -71,7 +71,7 @@ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 /* - * size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS], + * size_t __sha256_ce_transform(struct sha256_block_state *state, * const u8 *data, size_t nblocks); */ .text diff --git a/lib/crypto/arm64/sha256.c b/lib/crypto/arm64/sha256.c index fb9bff40357b..609ffb815198 100644 --- a/lib/crypto/arm64/sha256.c +++ b/lib/crypto/arm64/sha256.c @@ -10,17 +10,17 @@ #include #include -asmlinkage void sha256_block_data_order(u32 state[SHA256_STATE_WORDS], +asmlinkage void sha256_block_data_order(struct 
sha256_block_state *state, const u8 *data, size_t nblocks); -asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS], +asmlinkage void sha256_block_neon(struct sha256_block_state *state, const u8 *data, size_t nblocks); -asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS], +asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state, const u8 *data, size_t nblocks); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(struct sha256_block_state *state, const u8 *data, size_t nblocks) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && diff --git a/lib/crypto/powerpc/sha256.c b/lib/crypto/powerpc/sha256.c index 6b0f079587eb..55f42403d572 100644 --- a/lib/crypto/powerpc/sha256.c +++ b/lib/crypto/powerpc/sha256.c @@ -26,7 +26,8 @@ */ #define MAX_BYTES 1024 -extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks); +extern void ppc_spe_sha256_transform(struct sha256_block_state *state, + const u8 *src, u32 blocks); static void spe_begin(void) { @@ -42,7 +43,7 @@ static void spe_end(void) preempt_enable(); } -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(struct sha256_block_state *state, const u8 *data, size_t nblocks) { do { diff --git a/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S index fad501ad0617..1618d1220a6e 100644 --- a/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S +++ b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S @@ -106,7 +106,7 @@ sha256_4rounds \last, \k3, W3, W0, W1, W2 .endm -// void sha256_transform_zvknha_or_zvknhb_zvkb(u32 state[SHA256_STATE_WORDS], +// void sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state, // const u8 *data, size_t nblocks); SYM_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb) diff --git a/lib/crypto/riscv/sha256.c b/lib/crypto/riscv/sha256.c index aa77349d08f3..a2079aa3ae92 100644 --- a/lib/crypto/riscv/sha256.c +++ b/lib/crypto/riscv/sha256.c @@ -15,12 +15,13 @@ #include #include -asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb( - u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); +asmlinkage void +sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state, + const u8 *data, size_t nblocks); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(struct sha256_block_state *state, const u8 *data, size_t nblocks) { if (static_branch_likely(&have_extensions) && crypto_simd_usable()) { diff --git a/lib/crypto/s390/sha256.c b/lib/crypto/s390/sha256.c index 7dfe120fafab..fb565718f753 100644 --- a/lib/crypto/s390/sha256.c +++ b/lib/crypto/s390/sha256.c @@ -12,7 +12,7 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256); -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(struct sha256_block_state *state, const u8 *data, size_t nblocks) { if (static_branch_likely(&have_cpacf_sha256)) diff --git a/lib/crypto/sha256-generic.c b/lib/crypto/sha256-generic.c index 2968d95d0403..99f904033c26 100644 --- a/lib/crypto/sha256-generic.c +++ b/lib/crypto/sha256-generic.c @@ -70,7 +70,7 @@ static inline void BLEND_OP(int I, u32 *W) h = t1 + t2; \ } while (0) -static void sha256_block_generic(u32 state[SHA256_STATE_WORDS], +static void sha256_block_generic(struct 
sha256_block_state *state, const u8 *input, u32 W[64]) { u32 a, b, c, d, e, f, g, h; @@ -101,8 +101,14 @@ static void sha256_block_generic(u32 state[SHA256_STATE_WORDS], } /* load the state into our registers */ - a = state[0]; b = state[1]; c = state[2]; d = state[3]; - e = state[4]; f = state[5]; g = state[6]; h = state[7]; + a = state->h[0]; + b = state->h[1]; + c = state->h[2]; + d = state->h[3]; + e = state->h[4]; + f = state->h[5]; + g = state->h[6]; + h = state->h[7]; /* now iterate */ for (i = 0; i < 64; i += 8) { @@ -116,11 +122,17 @@ static void sha256_block_generic(u32 state[SHA256_STATE_WORDS], SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); } - state[0] += a; state[1] += b; state[2] += c; state[3] += d; - state[4] += e; state[5] += f; state[6] += g; state[7] += h; + state->h[0] += a; + state->h[1] += b; + state->h[2] += c; + state->h[3] += d; + state->h[4] += e; + state->h[5] += f; + state->h[6] += g; + state->h[7] += h; } -void sha256_blocks_generic(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_generic(struct sha256_block_state *state, const u8 *data, size_t nblocks) { u32 W[64]; diff --git a/lib/crypto/sparc/sha256.c b/lib/crypto/sparc/sha256.c index 8bdec2db08b3..060664b88a6d 100644 --- a/lib/crypto/sparc/sha256.c +++ b/lib/crypto/sparc/sha256.c @@ -19,10 +19,10 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes); -asmlinkage void sha256_sparc64_transform(u32 state[SHA256_STATE_WORDS], +asmlinkage void sha256_sparc64_transform(struct sha256_block_state *state, const u8 *data, size_t nblocks); -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(struct sha256_block_state *state, const u8 *data, size_t nblocks) { if (static_branch_likely(&have_sha256_opcodes)) diff --git a/lib/crypto/x86/sha256-avx-asm.S b/lib/crypto/x86/sha256-avx-asm.S index 0d7b2c3e45d9..73bcff2b548f 100644 --- a/lib/crypto/x86/sha256-avx-asm.S +++ b/lib/crypto/x86/sha256-avx-asm.S @@ -341,7 +341,7 @@ a = TMP_ .endm ######################################################################## -## void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], +## void sha256_transform_avx(struct sha256_block_state *state, ## const u8 *data, size_t nblocks); ######################################################################## .text diff --git a/lib/crypto/x86/sha256-avx2-asm.S b/lib/crypto/x86/sha256-avx2-asm.S index 25d3380321ec..45787570387f 100644 --- a/lib/crypto/x86/sha256-avx2-asm.S +++ b/lib/crypto/x86/sha256-avx2-asm.S @@ -518,7 +518,7 @@ STACK_SIZE = _CTX + _CTX_SIZE .endm ######################################################################## -## void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], +## void sha256_transform_rorx(struct sha256_block_state *state, ## const u8 *data, size_t nblocks); ######################################################################## .text diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S index d3548206cf3d..4af7d22e29e4 100644 --- a/lib/crypto/x86/sha256-ni-asm.S +++ b/lib/crypto/x86/sha256-ni-asm.S @@ -106,7 +106,7 @@ * only processes complete blocks. State initialization, buffering of partial * blocks, and digest finalization is expected to be handled elsewhere. 
* - * void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], + * void sha256_ni_transform(struct sha256_block_state *state, * const u8 *data, size_t nblocks); */ .text diff --git a/lib/crypto/x86/sha256-ssse3-asm.S b/lib/crypto/x86/sha256-ssse3-asm.S index 7f24a4cdcb25..407b30adcd37 100644 --- a/lib/crypto/x86/sha256-ssse3-asm.S +++ b/lib/crypto/x86/sha256-ssse3-asm.S @@ -348,7 +348,7 @@ a = TMP_ .endm ######################################################################## -## void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], +## void sha256_transform_ssse3(struct sha256_block_state *state, ## const u8 *data, size_t nblocks); ######################################################################## .text diff --git a/lib/crypto/x86/sha256.c b/lib/crypto/x86/sha256.c index baba74d7d26f..cbb45defbefa 100644 --- a/lib/crypto/x86/sha256.c +++ b/lib/crypto/x86/sha256.c @@ -11,20 +11,20 @@ #include #include -asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], +asmlinkage void sha256_transform_ssse3(struct sha256_block_state *state, const u8 *data, size_t nblocks); -asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], +asmlinkage void sha256_transform_avx(struct sha256_block_state *state, const u8 *data, size_t nblocks); -asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], +asmlinkage void sha256_transform_rorx(struct sha256_block_state *state, const u8 *data, size_t nblocks); -asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], +asmlinkage void sha256_ni_transform(struct sha256_block_state *state, const u8 *data, size_t nblocks); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86); DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3); -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], +void sha256_blocks_arch(struct sha256_block_state *state, const u8 *data, size_t nblocks) { if (static_branch_likely(&have_sha256_x86) && crypto_simd_usable()) { -- cgit v1.2.3 From 077833cd600908359391bd22d5350c9106ea238c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:39 -0700 Subject: lib/crypto: sha256: Add HMAC-SHA224 and HMAC-SHA256 support Since HMAC support is commonly needed and is fairly simple, include it as a first-class citizen of the SHA-256 library. The API supports both incremental and one-shot computation, and either preparing the key ahead of time or just using a raw key. The implementation is much more streamlined than crypto/hmac.c. I've kept it consistent with the HMAC-SHA384 and HMAC-SHA512 code as much as possible. Testing of these functions will be via sha224_kunit and sha256_kunit, added by a later commit. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-9-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha2.h | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib/crypto/sha256.c | 147 +++++++++++++++++++++++++++++++-- 2 files changed, 364 insertions(+), 5 deletions(-) diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index 18e1eec841b7..2e3fc2cf4aa0 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -131,6 +131,22 @@ struct __sha256_ctx { }; void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len); +/* + * HMAC key and message context structs, shared by HMAC-SHA224 and HMAC-SHA256. + * The hmac_sha224_* and hmac_sha256_* structs wrap this one so that the API has + * proper typing and doesn't allow mixing the functions arbitrarily. 
+ */ +struct __hmac_sha256_key { + struct sha256_block_state istate; + struct sha256_block_state ostate; +}; +struct __hmac_sha256_ctx { + struct __sha256_ctx sha_ctx; + struct sha256_block_state ostate; +}; +void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx, + const struct __hmac_sha256_key *key); + /** * struct sha224_ctx - Context for hashing a message with SHA-224 * @ctx: private @@ -148,6 +164,109 @@ static inline void sha224_update(struct sha224_ctx *ctx, void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]); void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]); +/** + * struct hmac_sha224_key - Prepared key for HMAC-SHA224 + * @key: private + */ +struct hmac_sha224_key { + struct __hmac_sha256_key key; +}; + +/** + * struct hmac_sha224_ctx - Context for computing HMAC-SHA224 of a message + * @ctx: private + */ +struct hmac_sha224_ctx { + struct __hmac_sha256_ctx ctx; +}; + +/** + * hmac_sha224_preparekey() - Prepare a key for HMAC-SHA224 + * @key: (output) the key structure to initialize + * @raw_key: the raw HMAC-SHA224 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * Note: the caller is responsible for zeroizing both the struct hmac_sha224_key + * and the raw key once they are no longer needed. + * + * Context: Any context. + */ +void hmac_sha224_preparekey(struct hmac_sha224_key *key, + const u8 *raw_key, size_t raw_key_len); + +/** + * hmac_sha224_init() - Initialize an HMAC-SHA224 context for a new message + * @ctx: (output) the HMAC context to initialize + * @key: the prepared HMAC key + * + * If you don't need incremental computation, consider hmac_sha224() instead. + * + * Context: Any context. + */ +static inline void hmac_sha224_init(struct hmac_sha224_ctx *ctx, + const struct hmac_sha224_key *key) +{ + __hmac_sha256_init(&ctx->ctx, &key->key); +} + +/** + * hmac_sha224_update() - Update an HMAC-SHA224 context with message data + * @ctx: the HMAC context to update; must have been initialized + * @data: the message data + * @data_len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +static inline void hmac_sha224_update(struct hmac_sha224_ctx *ctx, + const u8 *data, size_t data_len) +{ + __sha256_update(&ctx->ctx.sha_ctx, data, data_len); +} + +/** + * hmac_sha224_final() - Finish computing an HMAC-SHA224 value + * @ctx: the HMAC context to finalize; must have been initialized + * @out: (output) the resulting HMAC-SHA224 value + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void hmac_sha224_final(struct hmac_sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]); + +/** + * hmac_sha224() - Compute HMAC-SHA224 in one shot, using a prepared key + * @key: the prepared HMAC key + * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-SHA224 value + * + * If you're using the key only once, consider using hmac_sha224_usingrawkey(). + * + * Context: Any context. + */ +void hmac_sha224(const struct hmac_sha224_key *key, + const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]); + +/** + * hmac_sha224_usingrawkey() - Compute HMAC-SHA224 in one shot, using a raw key + * @raw_key: the raw HMAC-SHA224 key + * @raw_key_len: the key length in bytes. All key lengths are supported. 
+ * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-SHA224 value + * + * If you're using the key multiple times, prefer to use + * hmac_sha224_preparekey() followed by multiple calls to hmac_sha224() instead. + * + * Context: Any context. + */ +void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA224_DIGEST_SIZE]); + /** * struct sha256_ctx - Context for hashing a message with SHA-256 * @ctx: private @@ -165,6 +284,109 @@ static inline void sha256_update(struct sha256_ctx *ctx, void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); +/** + * struct hmac_sha256_key - Prepared key for HMAC-SHA256 + * @key: private + */ +struct hmac_sha256_key { + struct __hmac_sha256_key key; +}; + +/** + * struct hmac_sha256_ctx - Context for computing HMAC-SHA256 of a message + * @ctx: private + */ +struct hmac_sha256_ctx { + struct __hmac_sha256_ctx ctx; +}; + +/** + * hmac_sha256_preparekey() - Prepare a key for HMAC-SHA256 + * @key: (output) the key structure to initialize + * @raw_key: the raw HMAC-SHA256 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * Note: the caller is responsible for zeroizing both the struct hmac_sha256_key + * and the raw key once they are no longer needed. + * + * Context: Any context. + */ +void hmac_sha256_preparekey(struct hmac_sha256_key *key, + const u8 *raw_key, size_t raw_key_len); + +/** + * hmac_sha256_init() - Initialize an HMAC-SHA256 context for a new message + * @ctx: (output) the HMAC context to initialize + * @key: the prepared HMAC key + * + * If you don't need incremental computation, consider hmac_sha256() instead. + * + * Context: Any context. + */ +static inline void hmac_sha256_init(struct hmac_sha256_ctx *ctx, + const struct hmac_sha256_key *key) +{ + __hmac_sha256_init(&ctx->ctx, &key->key); +} + +/** + * hmac_sha256_update() - Update an HMAC-SHA256 context with message data + * @ctx: the HMAC context to update; must have been initialized + * @data: the message data + * @data_len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +static inline void hmac_sha256_update(struct hmac_sha256_ctx *ctx, + const u8 *data, size_t data_len) +{ + __sha256_update(&ctx->ctx.sha_ctx, data, data_len); +} + +/** + * hmac_sha256_final() - Finish computing an HMAC-SHA256 value + * @ctx: the HMAC context to finalize; must have been initialized + * @out: (output) the resulting HMAC-SHA256 value + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void hmac_sha256_final(struct hmac_sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); + +/** + * hmac_sha256() - Compute HMAC-SHA256 in one shot, using a prepared key + * @key: the prepared HMAC key + * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-SHA256 value + * + * If you're using the key only once, consider using hmac_sha256_usingrawkey(). + * + * Context: Any context. + */ +void hmac_sha256(const struct hmac_sha256_key *key, + const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]); + +/** + * hmac_sha256_usingrawkey() - Compute HMAC-SHA256 in one shot, using a raw key + * @raw_key: the raw HMAC-SHA256 key + * @raw_key_len: the key length in bytes. All key lengths are supported. 
+ * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-SHA256 value + * + * If you're using the key multiple times, prefer to use + * hmac_sha256_preparekey() followed by multiple calls to hmac_sha256() instead. + * + * Context: Any context. + */ +void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA256_DIGEST_SIZE]); + /* State for the SHA-512 (and SHA-384) compression function */ struct sha512_block_state { u64 h[8]; diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 3e7797a4489d..12b4b59052c4 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * SHA-256, as specified in - * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf - * - * SHA-256 code by Jean-Luc Cooke . + * SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions * * Copyright (c) Jean-Luc Cooke * Copyright (c) Andrew McDonald @@ -11,12 +8,14 @@ * Copyright (c) 2014 Red Hat Inc. */ +#include #include #include #include #include #include #include +#include static const struct sha256_block_state sha224_iv = { .h = { @@ -136,5 +135,143 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) } EXPORT_SYMBOL(sha256); -MODULE_DESCRIPTION("SHA-256 Algorithm"); +/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */ +#ifndef __DISABLE_EXPORTS +static void __hmac_sha256_preparekey(struct __hmac_sha256_key *key, + const u8 *raw_key, size_t raw_key_len, + const struct sha256_block_state *iv) +{ + union { + u8 b[SHA256_BLOCK_SIZE]; + unsigned long w[SHA256_BLOCK_SIZE / sizeof(unsigned long)]; + } derived_key = { 0 }; + + if (unlikely(raw_key_len > SHA256_BLOCK_SIZE)) { + if (iv == &sha224_iv) + sha224(raw_key, raw_key_len, derived_key.b); + else + sha256(raw_key, raw_key_len, derived_key.b); + } else { + memcpy(derived_key.b, raw_key, raw_key_len); + } + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); + key->istate = *iv; + sha256_blocks(&key->istate, derived_key.b, 1); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ + HMAC_IPAD_VALUE); + key->ostate = *iv; + sha256_blocks(&key->ostate, derived_key.b, 1); + + memzero_explicit(&derived_key, sizeof(derived_key)); +} + +void hmac_sha224_preparekey(struct hmac_sha224_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha256_preparekey(&key->key, raw_key, raw_key_len, &sha224_iv); +} +EXPORT_SYMBOL_GPL(hmac_sha224_preparekey); + +void hmac_sha256_preparekey(struct hmac_sha256_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha256_preparekey(&key->key, raw_key, raw_key_len, &sha256_iv); +} +EXPORT_SYMBOL_GPL(hmac_sha256_preparekey); + +void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx, + const struct __hmac_sha256_key *key) +{ + __sha256_init(&ctx->sha_ctx, &key->istate, SHA256_BLOCK_SIZE); + ctx->ostate = key->ostate; +} +EXPORT_SYMBOL_GPL(__hmac_sha256_init); + +static void __hmac_sha256_final(struct __hmac_sha256_ctx *ctx, + u8 *out, size_t digest_size) +{ + /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. 
*/ + __sha256_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size); + memset(&ctx->sha_ctx.buf[digest_size], 0, + SHA256_BLOCK_SIZE - digest_size); + ctx->sha_ctx.buf[digest_size] = 0x80; + *(__be32 *)&ctx->sha_ctx.buf[SHA256_BLOCK_SIZE - 4] = + cpu_to_be32(8 * (SHA256_BLOCK_SIZE + digest_size)); + + /* Compute the outer hash, which gives the HMAC value. */ + sha256_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1); + for (size_t i = 0; i < digest_size; i += 4) + put_unaligned_be32(ctx->ostate.h[i / 4], out + i); + + memzero_explicit(ctx, sizeof(*ctx)); +} + +void hmac_sha224_final(struct hmac_sha224_ctx *ctx, + u8 out[SHA224_DIGEST_SIZE]) +{ + __hmac_sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE); +} +EXPORT_SYMBOL_GPL(hmac_sha224_final); + +void hmac_sha256_final(struct hmac_sha256_ctx *ctx, + u8 out[SHA256_DIGEST_SIZE]) +{ + __hmac_sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE); +} +EXPORT_SYMBOL_GPL(hmac_sha256_final); + +void hmac_sha224(const struct hmac_sha224_key *key, + const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]) +{ + struct hmac_sha224_ctx ctx; + + hmac_sha224_init(&ctx, key); + hmac_sha224_update(&ctx, data, data_len); + hmac_sha224_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha224); + +void hmac_sha256(const struct hmac_sha256_key *key, + const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]) +{ + struct hmac_sha256_ctx ctx; + + hmac_sha256_init(&ctx, key); + hmac_sha256_update(&ctx, data, data_len); + hmac_sha256_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha256); + +void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA224_DIGEST_SIZE]) +{ + struct hmac_sha224_key key; + + hmac_sha224_preparekey(&key, raw_key, raw_key_len); + hmac_sha224(&key, data, data_len, out); + + memzero_explicit(&key, sizeof(key)); +} +EXPORT_SYMBOL_GPL(hmac_sha224_usingrawkey); + +void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA256_DIGEST_SIZE]) +{ + struct hmac_sha256_key key; + + hmac_sha256_preparekey(&key, raw_key, raw_key_len); + hmac_sha256(&key, data, data_len, out); + + memzero_explicit(&key, sizeof(key)); +} +EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey); +#endif /* !__DISABLE_EXPORTS */ + +MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions"); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From e0cd3716910385ba1ccbd433c860516cf806fc71 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:40 -0700 Subject: crypto: sha256 - Wrap library and add HMAC support Like I did for crypto/sha512.c, rework crypto/sha256.c to simply wrap the normal library functions instead of accessing the low-level arch- optimized and generic block functions directly. Also add support for HMAC-SHA224 and HMAC-SHA256, again just wrapping the library functions. Since the replacement crypto_shash algorithms are implemented using the (potentially arch-optimized) library functions, give them driver names ending with "-lib" rather than "-generic". Update crypto/testmgr.c and a couple odd drivers to take this change in driver name into account. Besides the above cases which are accounted for, there are no known cases where the driver names were being depended on. There is potential for confusion for people manually checking /proc/crypto (e.g. https://lore.kernel.org/r/9e33c893-2466-4d4e-afb1-966334e451a2@linux.ibm.com/), but really people just need to get used to the driver name not being meaningful for the software algorithms. 
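To make the wrapping concrete: each of the new "-lib" algorithms is a
thin delegation to the corresponding library routine. As a rough sketch
(the complete callbacks are in the diff below), the SHA-256 shash
reduces to:

	#define SHA256_CTX(desc) ((struct sha256_ctx *)shash_desc_ctx(desc))

	static int crypto_sha256_update(struct shash_desc *desc,
					const u8 *data, unsigned int len)
	{
		/* The library handles buffering and arch dispatch. */
		sha256_update(SHA256_CTX(desc), data, len);
		return 0;
	}

	static int crypto_sha256_digest(struct shash_desc *desc,
					const u8 *data, unsigned int len,
					u8 *out)
	{
		sha256(data, len, out);	/* one-shot library call */
		return 0;
	}

The HMAC variants follow the same pattern, with ->setkey calling
hmac_sha256_preparekey() on the tfm context and ->digest calling
hmac_sha256().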
Historically, the optimized code was disabled by default, so there was some purpose to checking whether it was enabled or not. However, this is now fixed for all SHA-2 algorithms, and the library code just always does the right thing. E.g. if the CPU supports SHA-256 instructions, they are used. This change does also mean that the generic partial block handling code in crypto/shash.c, which got added in 6.16, no longer gets used. But that's fine; the library has to implement the partial block handling anyway, and it's better to do it in the library since the block size and other properties of the algorithm are all fixed at compile time there, resulting in more streamlined code. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-10-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Kconfig | 4 +- crypto/Makefile | 1 - crypto/sha256.c | 286 +++++++++++++++------------------- crypto/testmgr.c | 12 ++ drivers/crypto/img-hash.c | 4 +- drivers/crypto/starfive/jh7110-hash.c | 8 +- 6 files changed, 148 insertions(+), 167 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index cb40a9b46972..3ea1397214e0 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -992,9 +992,9 @@ config CRYPTO_SHA256 tristate "SHA-224 and SHA-256" select CRYPTO_HASH select CRYPTO_LIB_SHA256 - select CRYPTO_LIB_SHA256_GENERIC help - SHA-224 and SHA-256 secure hash algorithms (FIPS 180, ISO/IEC 10118-3) + SHA-224 and SHA-256 secure hash algorithms (FIPS 180, ISO/IEC + 10118-3), including HMAC support. This is required for IPsec AH (XFRM_AH) and IPsec ESP (XFRM_ESP). Used by the btrfs filesystem, Ceph, NFS, and SMB. diff --git a/crypto/Makefile b/crypto/Makefile index 271c77462cec..5098fa6d5f39 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -77,7 +77,6 @@ obj-$(CONFIG_CRYPTO_MD5) += md5.o obj-$(CONFIG_CRYPTO_RMD160) += rmd160.o obj-$(CONFIG_CRYPTO_SHA1) += sha1_generic.o obj-$(CONFIG_CRYPTO_SHA256) += sha256.o -CFLAGS_sha256.o += -DARCH=$(ARCH) obj-$(CONFIG_CRYPTO_SHA512) += sha512.o obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o obj-$(CONFIG_CRYPTO_SM3_GENERIC) += sm3_generic.o diff --git a/crypto/sha256.c b/crypto/sha256.c index 15c57fba256b..d81166cbba95 100644 --- a/crypto/sha256.c +++ b/crypto/sha256.c @@ -1,17 +1,20 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Crypto API wrapper for the SHA-256 and SHA-224 library functions + * Crypto API support for SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 * * Copyright (c) Jean-Luc Cooke * Copyright (c) Andrew McDonald * Copyright (c) 2002 James Morris * SHA224 Support Copyright 2007 Intel Corporation + * Copyright 2025 Google LLC */ #include -#include +#include #include #include +/* SHA-224 */ + const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE] = { 0xd1, 0x4a, 0x02, 0x8c, 0x2a, 0x3a, 0x2b, 0xc9, 0x47, 0x61, 0x02, 0xbb, 0x28, 0x82, 0x34, 0xc4, 0x15, 0xa2, @@ -20,6 +23,36 @@ const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE] = { }; EXPORT_SYMBOL_GPL(sha224_zero_message_hash); +#define SHA224_CTX(desc) ((struct sha224_ctx *)shash_desc_ctx(desc)) + +static int crypto_sha224_init(struct shash_desc *desc) +{ + sha224_init(SHA224_CTX(desc)); + return 0; +} + +static int crypto_sha224_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + sha224_update(SHA224_CTX(desc), data, len); + return 0; +} + +static int crypto_sha224_final(struct shash_desc *desc, u8 *out) +{ + sha224_final(SHA224_CTX(desc), out); + return 0; +} + +static int crypto_sha224_digest(struct shash_desc *desc, + const u8 *data, unsigned 
int len, u8 *out) +{ + sha224(data, len, out); + return 0; +} + +/* SHA-256 */ + const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE] = { 0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24, @@ -28,256 +61,193 @@ const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE] = { }; EXPORT_SYMBOL_GPL(sha256_zero_message_hash); +#define SHA256_CTX(desc) ((struct sha256_ctx *)shash_desc_ctx(desc)) + static int crypto_sha256_init(struct shash_desc *desc) { - sha256_block_init(shash_desc_ctx(desc)); + sha256_init(SHA256_CTX(desc)); return 0; } -static inline int crypto_sha256_update(struct shash_desc *desc, const u8 *data, - unsigned int len, bool force_generic) +static int crypto_sha256_update(struct shash_desc *desc, + const u8 *data, unsigned int len) { - struct crypto_sha256_state *sctx = shash_desc_ctx(desc); - int remain = len % SHA256_BLOCK_SIZE; - - sctx->count += len - remain; - sha256_choose_blocks(sctx->state, data, len / SHA256_BLOCK_SIZE, - force_generic, !force_generic); - return remain; + sha256_update(SHA256_CTX(desc), data, len); + return 0; } -static int crypto_sha256_update_generic(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int crypto_sha256_final(struct shash_desc *desc, u8 *out) { - return crypto_sha256_update(desc, data, len, true); + sha256_final(SHA256_CTX(desc), out); + return 0; } -static int crypto_sha256_update_lib(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int crypto_sha256_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) { - sha256_update(shash_desc_ctx(desc), data, len); + sha256(data, len, out); return 0; } -static int crypto_sha256_update_arch(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return crypto_sha256_update(desc, data, len, false); -} +/* HMAC-SHA224 */ -static int crypto_sha256_final_lib(struct shash_desc *desc, u8 *out) -{ - sha256_final(shash_desc_ctx(desc), out); - return 0; -} +#define HMAC_SHA224_KEY(tfm) ((struct hmac_sha224_key *)crypto_shash_ctx(tfm)) +#define HMAC_SHA224_CTX(desc) ((struct hmac_sha224_ctx *)shash_desc_ctx(desc)) -static __always_inline int crypto_sha256_finup(struct shash_desc *desc, - const u8 *data, - unsigned int len, u8 *out, - bool force_generic) +static int crypto_hmac_sha224_setkey(struct crypto_shash *tfm, + const u8 *raw_key, unsigned int keylen) { - struct crypto_sha256_state *sctx = shash_desc_ctx(desc); - unsigned int remain = len; - u8 *buf; - - if (len >= SHA256_BLOCK_SIZE) - remain = crypto_sha256_update(desc, data, len, force_generic); - sctx->count += remain; - buf = memcpy(sctx + 1, data + len - remain, remain); - sha256_finup(sctx, buf, remain, out, - crypto_shash_digestsize(desc->tfm), force_generic, - !force_generic); + hmac_sha224_preparekey(HMAC_SHA224_KEY(tfm), raw_key, keylen); return 0; } -static int crypto_sha256_finup_generic(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int crypto_hmac_sha224_init(struct shash_desc *desc) { - return crypto_sha256_finup(desc, data, len, out, true); + hmac_sha224_init(HMAC_SHA224_CTX(desc), HMAC_SHA224_KEY(desc->tfm)); + return 0; } -static int crypto_sha256_finup_arch(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int crypto_hmac_sha224_update(struct shash_desc *desc, + const u8 *data, unsigned int len) { - return crypto_sha256_finup(desc, data, len, out, false); + hmac_sha224_update(HMAC_SHA224_CTX(desc), data, len); + return 0; } -static int 
crypto_sha256_digest_generic(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int crypto_hmac_sha224_final(struct shash_desc *desc, u8 *out) { - crypto_sha256_init(desc); - return crypto_sha256_finup_generic(desc, data, len, out); + hmac_sha224_final(HMAC_SHA224_CTX(desc), out); + return 0; } -static int crypto_sha256_digest_lib(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int crypto_hmac_sha224_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, + u8 *out) { - sha256(data, len, out); + hmac_sha224(HMAC_SHA224_KEY(desc->tfm), data, len, out); return 0; } -static int crypto_sha256_digest_arch(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +/* HMAC-SHA256 */ + +#define HMAC_SHA256_KEY(tfm) ((struct hmac_sha256_key *)crypto_shash_ctx(tfm)) +#define HMAC_SHA256_CTX(desc) ((struct hmac_sha256_ctx *)shash_desc_ctx(desc)) + +static int crypto_hmac_sha256_setkey(struct crypto_shash *tfm, + const u8 *raw_key, unsigned int keylen) { - crypto_sha256_init(desc); - return crypto_sha256_finup_arch(desc, data, len, out); + hmac_sha256_preparekey(HMAC_SHA256_KEY(tfm), raw_key, keylen); + return 0; } -static int crypto_sha224_init(struct shash_desc *desc) +static int crypto_hmac_sha256_init(struct shash_desc *desc) { - sha224_block_init(shash_desc_ctx(desc)); + hmac_sha256_init(HMAC_SHA256_CTX(desc), HMAC_SHA256_KEY(desc->tfm)); return 0; } -static int crypto_sha224_final_lib(struct shash_desc *desc, u8 *out) +static int crypto_hmac_sha256_update(struct shash_desc *desc, + const u8 *data, unsigned int len) { - sha224_final(shash_desc_ctx(desc), out); + hmac_sha256_update(HMAC_SHA256_CTX(desc), data, len); return 0; } -static int crypto_sha256_import_lib(struct shash_desc *desc, const void *in) +static int crypto_hmac_sha256_final(struct shash_desc *desc, u8 *out) { - struct __sha256_ctx *sctx = shash_desc_ctx(desc); - const u8 *p = in; - - memcpy(sctx, p, sizeof(*sctx)); - p += sizeof(*sctx); - sctx->bytecount += *p; + hmac_sha256_final(HMAC_SHA256_CTX(desc), out); return 0; } -static int crypto_sha256_export_lib(struct shash_desc *desc, void *out) +static int crypto_hmac_sha256_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, + u8 *out) { - struct __sha256_ctx *sctx0 = shash_desc_ctx(desc); - struct __sha256_ctx sctx = *sctx0; - unsigned int partial; - u8 *p = out; - - partial = sctx.bytecount % SHA256_BLOCK_SIZE; - sctx.bytecount -= partial; - memcpy(p, &sctx, sizeof(sctx)); - p += sizeof(sctx); - *p = partial; + hmac_sha256(HMAC_SHA256_KEY(desc->tfm), data, len, out); return 0; } +/* Algorithm definitions */ + static struct shash_alg algs[] = { - { - .base.cra_name = "sha256", - .base.cra_driver_name = "sha256-generic", - .base.cra_priority = 100, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .base.cra_blocksize = SHA256_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .digestsize = SHA256_DIGEST_SIZE, - .init = crypto_sha256_init, - .update = crypto_sha256_update_generic, - .finup = crypto_sha256_finup_generic, - .digest = crypto_sha256_digest_generic, - .descsize = sizeof(struct crypto_sha256_state), - }, { .base.cra_name = "sha224", - .base.cra_driver_name = "sha224-generic", - .base.cra_priority = 100, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, + .base.cra_driver_name = "sha224-lib", + .base.cra_priority = 300, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .digestsize = 
SHA224_DIGEST_SIZE, .init = crypto_sha224_init, - .update = crypto_sha256_update_generic, - .finup = crypto_sha256_finup_generic, - .descsize = sizeof(struct crypto_sha256_state), + .update = crypto_sha224_update, + .final = crypto_sha224_final, + .digest = crypto_sha224_digest, + .descsize = sizeof(struct sha224_ctx), }, { .base.cra_name = "sha256", .base.cra_driver_name = "sha256-lib", + .base.cra_priority = 300, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .digestsize = SHA256_DIGEST_SIZE, .init = crypto_sha256_init, - .update = crypto_sha256_update_lib, - .final = crypto_sha256_final_lib, - .digest = crypto_sha256_digest_lib, + .update = crypto_sha256_update, + .final = crypto_sha256_final, + .digest = crypto_sha256_digest, .descsize = sizeof(struct sha256_ctx), - .statesize = sizeof(struct crypto_sha256_state) + - SHA256_BLOCK_SIZE + 1, - .import = crypto_sha256_import_lib, - .export = crypto_sha256_export_lib, }, { - .base.cra_name = "sha224", - .base.cra_driver_name = "sha224-lib", + .base.cra_name = "hmac(sha224)", + .base.cra_driver_name = "hmac-sha224-lib", + .base.cra_priority = 300, .base.cra_blocksize = SHA224_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct hmac_sha224_key), .base.cra_module = THIS_MODULE, .digestsize = SHA224_DIGEST_SIZE, - .init = crypto_sha224_init, - .update = crypto_sha256_update_lib, - .final = crypto_sha224_final_lib, - .descsize = sizeof(struct sha224_ctx), - .statesize = sizeof(struct crypto_sha256_state) + - SHA256_BLOCK_SIZE + 1, - .import = crypto_sha256_import_lib, - .export = crypto_sha256_export_lib, + .setkey = crypto_hmac_sha224_setkey, + .init = crypto_hmac_sha224_init, + .update = crypto_hmac_sha224_update, + .final = crypto_hmac_sha224_final, + .digest = crypto_hmac_sha224_digest, + .descsize = sizeof(struct hmac_sha224_ctx), }, { - .base.cra_name = "sha256", - .base.cra_driver_name = "sha256-" __stringify(ARCH), + .base.cra_name = "hmac(sha256)", + .base.cra_driver_name = "hmac-sha256-lib", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SHA256_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct hmac_sha256_key), .base.cra_module = THIS_MODULE, .digestsize = SHA256_DIGEST_SIZE, - .init = crypto_sha256_init, - .update = crypto_sha256_update_arch, - .finup = crypto_sha256_finup_arch, - .digest = crypto_sha256_digest_arch, - .descsize = sizeof(struct crypto_sha256_state), - }, - { - .base.cra_name = "sha224", - .base.cra_driver_name = "sha224-" __stringify(ARCH), - .base.cra_priority = 300, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .base.cra_blocksize = SHA224_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .digestsize = SHA224_DIGEST_SIZE, - .init = crypto_sha224_init, - .update = crypto_sha256_update_arch, - .finup = crypto_sha256_finup_arch, - .descsize = sizeof(struct crypto_sha256_state), + .setkey = crypto_hmac_sha256_setkey, + .init = crypto_hmac_sha256_init, + .update = crypto_hmac_sha256_update, + .final = crypto_hmac_sha256_final, + .digest = crypto_hmac_sha256_digest, + .descsize = sizeof(struct hmac_sha256_ctx), }, }; -static unsigned int num_algs; - static int __init crypto_sha256_mod_init(void) { - /* register the arch flavours only if they differ from generic */ - num_algs = ARRAY_SIZE(algs); - BUILD_BUG_ON(ARRAY_SIZE(algs) <= 2); - if (!sha256_is_arch_optimized()) - num_algs -= 2; return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } module_init(crypto_sha256_mod_init); static void 
__exit crypto_sha256_mod_exit(void) { - crypto_unregister_shashes(algs, num_algs); + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } module_exit(crypto_sha256_mod_exit); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Crypto API wrapper for the SHA-256 and SHA-224 library functions"); +MODULE_DESCRIPTION("Crypto API support for SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256"); -MODULE_ALIAS_CRYPTO("sha256"); -MODULE_ALIAS_CRYPTO("sha256-generic"); -MODULE_ALIAS_CRYPTO("sha256-" __stringify(ARCH)); MODULE_ALIAS_CRYPTO("sha224"); -MODULE_ALIAS_CRYPTO("sha224-generic"); -MODULE_ALIAS_CRYPTO("sha224-" __stringify(ARCH)); +MODULE_ALIAS_CRYPTO("sha224-lib"); +MODULE_ALIAS_CRYPTO("sha256"); +MODULE_ALIAS_CRYPTO("sha256-lib"); +MODULE_ALIAS_CRYPTO("hmac(sha224)"); +MODULE_ALIAS_CRYPTO("hmac-sha224-lib"); +MODULE_ALIAS_CRYPTO("hmac(sha256)"); +MODULE_ALIAS_CRYPTO("hmac-sha256-lib"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 9d8b11ea4af7..4e95567f7ed1 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4270,18 +4270,21 @@ static const struct alg_test_desc alg_test_descs[] = { .fips_allowed = 1, }, { .alg = "authenc(hmac(sha224),cbc(des))", + .generic_driver = "authenc(hmac-sha224-lib,cbc(des-generic))", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_sha224_des_cbc_tv_temp) } }, { .alg = "authenc(hmac(sha224),cbc(des3_ede))", + .generic_driver = "authenc(hmac-sha224-lib,cbc(des3_ede-generic))", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_sha224_des3_ede_cbc_tv_temp) } }, { .alg = "authenc(hmac(sha256),cbc(aes))", + .generic_driver = "authenc(hmac-sha256-lib,cbc(aes-generic))", .test = alg_test_aead, .fips_allowed = 1, .suite = { @@ -4289,12 +4292,14 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "authenc(hmac(sha256),cbc(des))", + .generic_driver = "authenc(hmac-sha256-lib,cbc(des-generic))", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_sha256_des_cbc_tv_temp) } }, { .alg = "authenc(hmac(sha256),cbc(des3_ede))", + .generic_driver = "authenc(hmac-sha256-lib,cbc(des3_ede-generic))", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_sha256_des3_ede_cbc_tv_temp) @@ -4305,6 +4310,7 @@ static const struct alg_test_desc alg_test_descs[] = { .fips_allowed = 1, }, { .alg = "authenc(hmac(sha256),cts(cbc(aes)))", + .generic_driver = "authenc(hmac-sha256-lib,cts(cbc(aes-generic)))", .test = alg_test_aead, .suite = { .aead = __VECS(krb5_test_aes128_cts_hmac_sha256_128) @@ -5015,6 +5021,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "essiv(authenc(hmac(sha256),cbc(aes)),sha256)", + .generic_driver = "essiv(authenc(hmac-sha256-lib,cbc(aes-generic)),sha256-lib)", .test = alg_test_aead, .fips_allowed = 1, .suite = { @@ -5022,6 +5029,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "essiv(cbc(aes),sha256)", + .generic_driver = "essiv(cbc(aes-generic),sha256-lib)", .test = alg_test_skcipher, .fips_allowed = 1, .suite = { @@ -5121,6 +5129,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha224)", + .generic_driver = "hmac-sha224-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5128,6 +5137,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha256)", + .generic_driver = "hmac-sha256-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5459,6 +5469,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha224", + .generic_driver = "sha224-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { 
@@ -5466,6 +5477,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha256", + .generic_driver = "sha256-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c index e050f5ff5efb..f312eb075fec 100644 --- a/drivers/crypto/img-hash.c +++ b/drivers/crypto/img-hash.c @@ -710,12 +710,12 @@ static int img_hash_cra_sha1_init(struct crypto_tfm *tfm) static int img_hash_cra_sha224_init(struct crypto_tfm *tfm) { - return img_hash_cra_init(tfm, "sha224-generic"); + return img_hash_cra_init(tfm, "sha224-lib"); } static int img_hash_cra_sha256_init(struct crypto_tfm *tfm) { - return img_hash_cra_init(tfm, "sha256-generic"); + return img_hash_cra_init(tfm, "sha256-lib"); } static void img_hash_cra_exit(struct crypto_tfm *tfm) diff --git a/drivers/crypto/starfive/jh7110-hash.c b/drivers/crypto/starfive/jh7110-hash.c index 4abbff07412f..6cfe0238f615 100644 --- a/drivers/crypto/starfive/jh7110-hash.c +++ b/drivers/crypto/starfive/jh7110-hash.c @@ -493,13 +493,13 @@ static int starfive_hash_setkey(struct crypto_ahash *hash, static int starfive_sha224_init_tfm(struct crypto_ahash *hash) { - return starfive_hash_init_tfm(hash, "sha224-generic", + return starfive_hash_init_tfm(hash, "sha224-lib", STARFIVE_HASH_SHA224, 0); } static int starfive_sha256_init_tfm(struct crypto_ahash *hash) { - return starfive_hash_init_tfm(hash, "sha256-generic", + return starfive_hash_init_tfm(hash, "sha256-lib", STARFIVE_HASH_SHA256, 0); } @@ -523,13 +523,13 @@ static int starfive_sm3_init_tfm(struct crypto_ahash *hash) static int starfive_hmac_sha224_init_tfm(struct crypto_ahash *hash) { - return starfive_hash_init_tfm(hash, "hmac(sha224-generic)", + return starfive_hash_init_tfm(hash, "hmac-sha224-lib", STARFIVE_HASH_SHA224, 1); } static int starfive_hmac_sha256_init_tfm(struct crypto_ahash *hash) { - return starfive_hash_init_tfm(hash, "hmac(sha256-generic)", + return starfive_hash_init_tfm(hash, "hmac-sha256-lib", STARFIVE_HASH_SHA256, 1); } -- cgit v1.2.3 From 07f090959bba437dd531bd2e08440ec6bf588878 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:41 -0700 Subject: crypto: sha256 - Use same state format as legacy drivers Make the export and import functions for the sha224, sha256, hmac(sha224), and hmac(sha256) shash algorithms use the same format as the padlock-sha and nx-sha256 drivers, as required by Herbert. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-11-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/sha256.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/crypto/sha256.c b/crypto/sha256.c index d81166cbba95..052806559f06 100644 --- a/crypto/sha256.c +++ b/crypto/sha256.c @@ -13,6 +13,43 @@ #include #include +/* + * Export and import functions. crypto_shash wants a particular format that + * matches that used by some legacy drivers. It currently is the same as the + * library SHA context, except the value in bytecount must be block-aligned and + * the remainder must be stored in an extra u8 appended to the struct. 
+ */ + +#define SHA256_SHASH_STATE_SIZE 105 +static_assert(offsetof(struct __sha256_ctx, state) == 0); +static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); +static_assert(offsetof(struct __sha256_ctx, buf) == 40); +static_assert(sizeof(struct __sha256_ctx) + 1 == SHA256_SHASH_STATE_SIZE); + +static int __crypto_sha256_export(const struct __sha256_ctx *ctx0, void *out) +{ + struct __sha256_ctx ctx = *ctx0; + unsigned int partial; + u8 *p = out; + + partial = ctx.bytecount % SHA256_BLOCK_SIZE; + ctx.bytecount -= partial; + memcpy(p, &ctx, sizeof(ctx)); + p += sizeof(ctx); + *p = partial; + return 0; +} + +static int __crypto_sha256_import(struct __sha256_ctx *ctx, const void *in) +{ + const u8 *p = in; + + memcpy(ctx, p, sizeof(*ctx)); + p += sizeof(*ctx); + ctx->bytecount += *p; + return 0; +} + /* SHA-224 */ const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE] = { @@ -51,6 +88,16 @@ static int crypto_sha224_digest(struct shash_desc *desc, return 0; } +static int crypto_sha224_export(struct shash_desc *desc, void *out) +{ + return __crypto_sha256_export(&SHA224_CTX(desc)->ctx, out); +} + +static int crypto_sha224_import(struct shash_desc *desc, const void *in) +{ + return __crypto_sha256_import(&SHA224_CTX(desc)->ctx, in); +} + /* SHA-256 */ const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE] = { @@ -89,6 +136,16 @@ static int crypto_sha256_digest(struct shash_desc *desc, return 0; } +static int crypto_sha256_export(struct shash_desc *desc, void *out) +{ + return __crypto_sha256_export(&SHA256_CTX(desc)->ctx, out); +} + +static int crypto_sha256_import(struct shash_desc *desc, const void *in) +{ + return __crypto_sha256_import(&SHA256_CTX(desc)->ctx, in); +} + /* HMAC-SHA224 */ #define HMAC_SHA224_KEY(tfm) ((struct hmac_sha224_key *)crypto_shash_ctx(tfm)) @@ -128,6 +185,19 @@ static int crypto_hmac_sha224_digest(struct shash_desc *desc, return 0; } +static int crypto_hmac_sha224_export(struct shash_desc *desc, void *out) +{ + return __crypto_sha256_export(&HMAC_SHA224_CTX(desc)->ctx.sha_ctx, out); +} + +static int crypto_hmac_sha224_import(struct shash_desc *desc, const void *in) +{ + struct hmac_sha224_ctx *ctx = HMAC_SHA224_CTX(desc); + + ctx->ctx.ostate = HMAC_SHA224_KEY(desc->tfm)->key.ostate; + return __crypto_sha256_import(&ctx->ctx.sha_ctx, in); +} + /* HMAC-SHA256 */ #define HMAC_SHA256_KEY(tfm) ((struct hmac_sha256_key *)crypto_shash_ctx(tfm)) @@ -167,6 +237,19 @@ static int crypto_hmac_sha256_digest(struct shash_desc *desc, return 0; } +static int crypto_hmac_sha256_export(struct shash_desc *desc, void *out) +{ + return __crypto_sha256_export(&HMAC_SHA256_CTX(desc)->ctx.sha_ctx, out); +} + +static int crypto_hmac_sha256_import(struct shash_desc *desc, const void *in) +{ + struct hmac_sha256_ctx *ctx = HMAC_SHA256_CTX(desc); + + ctx->ctx.ostate = HMAC_SHA256_KEY(desc->tfm)->key.ostate; + return __crypto_sha256_import(&ctx->ctx.sha_ctx, in); +} + /* Algorithm definitions */ static struct shash_alg algs[] = { @@ -181,7 +264,10 @@ static struct shash_alg algs[] = { .update = crypto_sha224_update, .final = crypto_sha224_final, .digest = crypto_sha224_digest, + .export = crypto_sha224_export, + .import = crypto_sha224_import, .descsize = sizeof(struct sha224_ctx), + .statesize = SHA256_SHASH_STATE_SIZE, }, { .base.cra_name = "sha256", @@ -194,7 +280,10 @@ static struct shash_alg algs[] = { .update = crypto_sha256_update, .final = crypto_sha256_final, .digest = crypto_sha256_digest, + .export = crypto_sha256_export, + .import = crypto_sha256_import, .descsize = 
sizeof(struct sha256_ctx), + .statesize = SHA256_SHASH_STATE_SIZE, }, { .base.cra_name = "hmac(sha224)", @@ -209,7 +298,10 @@ static struct shash_alg algs[] = { .update = crypto_hmac_sha224_update, .final = crypto_hmac_sha224_final, .digest = crypto_hmac_sha224_digest, + .export = crypto_hmac_sha224_export, + .import = crypto_hmac_sha224_import, .descsize = sizeof(struct hmac_sha224_ctx), + .statesize = SHA256_SHASH_STATE_SIZE, }, { .base.cra_name = "hmac(sha256)", @@ -224,7 +316,10 @@ static struct shash_alg algs[] = { .update = crypto_hmac_sha256_update, .final = crypto_hmac_sha256_final, .digest = crypto_hmac_sha256_digest, + .export = crypto_hmac_sha256_export, + .import = crypto_hmac_sha256_import, .descsize = sizeof(struct hmac_sha256_ctx), + .statesize = SHA256_SHASH_STATE_SIZE, }, }; -- cgit v1.2.3 From 9f9846a72eec406db9e1eadcad1dd5e90aa0f355 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:42 -0700 Subject: lib/crypto: sha256: Remove sha256_is_arch_optimized() Remove sha256_is_arch_optimized(), since it is no longer used. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-12-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/mips/cavium-octeon/crypto/octeon-sha256.c | 6 ------ include/crypto/internal/sha2.h | 8 -------- lib/crypto/arm/sha256.c | 7 ------- lib/crypto/arm64/sha256.c | 7 ------- lib/crypto/powerpc/sha256.c | 6 ------ lib/crypto/riscv/sha256.c | 6 ------ lib/crypto/s390/sha256.c | 6 ------ lib/crypto/sparc/sha256.c | 6 ------ lib/crypto/x86/sha256.c | 6 ------ 9 files changed, 58 deletions(-) diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha256.c b/arch/mips/cavium-octeon/crypto/octeon-sha256.c index f8664818d04e..c7c67bdc2bd0 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-sha256.c +++ b/arch/mips/cavium-octeon/crypto/octeon-sha256.c @@ -61,12 +61,6 @@ void sha256_blocks_arch(struct sha256_block_state *state, } EXPORT_SYMBOL_GPL(sha256_blocks_arch); -bool sha256_is_arch_optimized(void) -{ - return octeon_has_crypto(); -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm (OCTEON)"); MODULE_AUTHOR("Aaro Koskinen "); diff --git a/include/crypto/internal/sha2.h b/include/crypto/internal/sha2.h index f0f455477bbd..7915a3a46bc8 100644 --- a/include/crypto/internal/sha2.h +++ b/include/crypto/internal/sha2.h @@ -9,14 +9,6 @@ #include #include -#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_SHA256) -bool sha256_is_arch_optimized(void); -#else -static inline bool sha256_is_arch_optimized(void) -{ - return false; -} -#endif void sha256_blocks_generic(struct sha256_block_state *state, const u8 *data, size_t nblocks); void sha256_blocks_arch(struct sha256_block_state *state, diff --git a/lib/crypto/arm/sha256.c b/lib/crypto/arm/sha256.c index 7d9082358695..27181be0aa92 100644 --- a/lib/crypto/arm/sha256.c +++ b/lib/crypto/arm/sha256.c @@ -37,13 +37,6 @@ void sha256_blocks_arch(struct sha256_block_state *state, } EXPORT_SYMBOL_GPL(sha256_blocks_arch); -bool sha256_is_arch_optimized(void) -{ - /* We always can use at least the ARM scalar implementation. 
*/ - return true; -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - static int __init sha256_arm_mod_init(void) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { diff --git a/lib/crypto/arm64/sha256.c b/lib/crypto/arm64/sha256.c index 609ffb815198..a5a498276708 100644 --- a/lib/crypto/arm64/sha256.c +++ b/lib/crypto/arm64/sha256.c @@ -47,13 +47,6 @@ void sha256_blocks_arch(struct sha256_block_state *state, } EXPORT_SYMBOL_GPL(sha256_blocks_arch); -bool sha256_is_arch_optimized(void) -{ - /* We always can use at least the ARM64 scalar implementation. */ - return true; -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - static int __init sha256_arm64_mod_init(void) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && diff --git a/lib/crypto/powerpc/sha256.c b/lib/crypto/powerpc/sha256.c index 55f42403d572..14b8adcdcfc2 100644 --- a/lib/crypto/powerpc/sha256.c +++ b/lib/crypto/powerpc/sha256.c @@ -61,11 +61,5 @@ void sha256_blocks_arch(struct sha256_block_state *state, } EXPORT_SYMBOL_GPL(sha256_blocks_arch); -bool sha256_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm, SPE optimized"); diff --git a/lib/crypto/riscv/sha256.c b/lib/crypto/riscv/sha256.c index a2079aa3ae92..01004cb9c6e9 100644 --- a/lib/crypto/riscv/sha256.c +++ b/lib/crypto/riscv/sha256.c @@ -34,12 +34,6 @@ void sha256_blocks_arch(struct sha256_block_state *state, } EXPORT_SYMBOL_GPL(sha256_blocks_arch); -bool sha256_is_arch_optimized(void) -{ - return static_key_enabled(&have_extensions); -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - static int __init riscv64_sha256_mod_init(void) { /* Both zvknha and zvknhb provide the SHA-256 instructions. */ diff --git a/lib/crypto/s390/sha256.c b/lib/crypto/s390/sha256.c index fb565718f753..6ebfd35a5d44 100644 --- a/lib/crypto/s390/sha256.c +++ b/lib/crypto/s390/sha256.c @@ -23,12 +23,6 @@ void sha256_blocks_arch(struct sha256_block_state *state, } EXPORT_SYMBOL_GPL(sha256_blocks_arch); -bool sha256_is_arch_optimized(void) -{ - return static_key_enabled(&have_cpacf_sha256); -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - static int __init sha256_s390_mod_init(void) { if (cpu_have_feature(S390_CPU_FEATURE_MSA) && diff --git a/lib/crypto/sparc/sha256.c b/lib/crypto/sparc/sha256.c index 060664b88a6d..f41c109c1c18 100644 --- a/lib/crypto/sparc/sha256.c +++ b/lib/crypto/sparc/sha256.c @@ -32,12 +32,6 @@ void sha256_blocks_arch(struct sha256_block_state *state, } EXPORT_SYMBOL_GPL(sha256_blocks_arch); -bool sha256_is_arch_optimized(void) -{ - return static_key_enabled(&have_sha256_opcodes); -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - static int __init sha256_sparc64_mod_init(void) { unsigned long cfr; diff --git a/lib/crypto/x86/sha256.c b/lib/crypto/x86/sha256.c index cbb45defbefa..9ee38d2b3d57 100644 --- a/lib/crypto/x86/sha256.c +++ b/lib/crypto/x86/sha256.c @@ -37,12 +37,6 @@ void sha256_blocks_arch(struct sha256_block_state *state, } EXPORT_SYMBOL_GPL(sha256_blocks_arch); -bool sha256_is_arch_optimized(void) -{ - return static_key_enabled(&have_sha256_x86); -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - static int __init sha256_x86_mod_init(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { -- cgit v1.2.3 From e96cb9507f2d8ba150d417dcd283204564945831 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:43 -0700 Subject: lib/crypto: sha256: Consolidate into single module Consolidate the CPU-based SHA-256 code into a single 
module, following what I did with SHA-512: - Each arch now provides a header file lib/crypto/$(SRCARCH)/sha256.h, replacing lib/crypto/$(SRCARCH)/sha256.c. The header defines sha256_blocks() and optionally sha256_mod_init_arch(). It is included by lib/crypto/sha256.c, and thus the code gets built into the single libsha256 module, with proper inlining and dead code elimination. - sha256_blocks_generic() is moved from lib/crypto/sha256-generic.c into lib/crypto/sha256.c. It's now a static function marked with __maybe_unused, so the compiler automatically eliminates it in any cases where it's not used. - Whether arch-optimized SHA-256 is buildable is now controlled centrally by lib/crypto/Kconfig instead of by lib/crypto/$(SRCARCH)/Kconfig. The conditions for enabling it remain the same as before, and it remains enabled by default. - Any additional arch-specific translation units for the optimized SHA-256 code (such as assembly files) are now compiled by lib/crypto/Makefile instead of lib/crypto/$(SRCARCH)/Makefile. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-13-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/mips/cavium-octeon/Kconfig | 6 - arch/mips/cavium-octeon/crypto/Makefile | 1 - arch/mips/cavium-octeon/crypto/octeon-sha256.c | 66 ----------- include/crypto/internal/sha2.h | 52 --------- lib/crypto/Kconfig | 26 ++--- lib/crypto/Makefile | 39 ++++++- lib/crypto/arm/Kconfig | 6 - lib/crypto/arm/Makefile | 8 +- lib/crypto/arm/sha256.c | 57 ---------- lib/crypto/arm/sha256.h | 46 ++++++++ lib/crypto/arm64/Kconfig | 5 - lib/crypto/arm64/Makefile | 9 +- lib/crypto/arm64/sha256.c | 68 ----------- lib/crypto/arm64/sha256.h | 57 ++++++++++ lib/crypto/mips/sha256.h | 58 ++++++++++ lib/crypto/powerpc/Kconfig | 6 - lib/crypto/powerpc/Makefile | 3 - lib/crypto/powerpc/sha256.c | 65 ----------- lib/crypto/powerpc/sha256.h | 58 ++++++++++ lib/crypto/riscv/Kconfig | 7 -- lib/crypto/riscv/Makefile | 3 - lib/crypto/riscv/sha256.c | 56 --------- lib/crypto/riscv/sha256.h | 42 +++++++ lib/crypto/s390/Kconfig | 6 - lib/crypto/s390/Makefile | 3 - lib/crypto/s390/sha256.c | 41 ------- lib/crypto/s390/sha256.h | 28 +++++ lib/crypto/sha256-generic.c | 150 ------------------------- lib/crypto/sha256.c | 146 +++++++++++++++++++++--- lib/crypto/sparc/Kconfig | 8 -- lib/crypto/sparc/Makefile | 4 - lib/crypto/sparc/sha256.c | 58 ---------- lib/crypto/sparc/sha256.h | 43 +++++++ lib/crypto/x86/Kconfig | 7 -- lib/crypto/x86/Makefile | 3 - lib/crypto/x86/sha256.c | 68 ----------- lib/crypto/x86/sha256.h | 55 +++++++++ 37 files changed, 565 insertions(+), 799 deletions(-) delete mode 100644 arch/mips/cavium-octeon/crypto/octeon-sha256.c delete mode 100644 include/crypto/internal/sha2.h delete mode 100644 lib/crypto/arm/sha256.c create mode 100644 lib/crypto/arm/sha256.h delete mode 100644 lib/crypto/arm64/sha256.c create mode 100644 lib/crypto/arm64/sha256.h create mode 100644 lib/crypto/mips/sha256.h delete mode 100644 lib/crypto/powerpc/sha256.c create mode 100644 lib/crypto/powerpc/sha256.h delete mode 100644 lib/crypto/riscv/sha256.c create mode 100644 lib/crypto/riscv/sha256.h delete mode 100644 lib/crypto/s390/sha256.c create mode 100644 lib/crypto/s390/sha256.h delete mode 100644 lib/crypto/sha256-generic.c delete mode 100644 lib/crypto/sparc/Kconfig delete mode 100644 lib/crypto/sparc/Makefile delete mode 100644 lib/crypto/sparc/sha256.c create mode 100644 lib/crypto/sparc/sha256.h delete mode 100644 lib/crypto/x86/sha256.c create mode 100644 lib/crypto/x86/sha256.h 
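For reference, every new lib/crypto/$(SRCARCH)/sha256.h in the diffs
below follows the same shape. A minimal sketch, with the arch-specific
feature check and block function left as placeholders:

	/*
	 * Included by lib/crypto/sha256.c, which provides the types and the
	 * static sha256_blocks_generic() fallback, so the header only has to
	 * define sha256_blocks() and, optionally, sha256_mod_init_arch().
	 */
	static void sha256_blocks(struct sha256_block_state *state,
				  const u8 *data, size_t nblocks)
	{
		if (!arch_sha256_usable()) {	/* placeholder feature check */
			sha256_blocks_generic(state, data, nblocks);
			return;
		}
		/* ... call the arch-optimized block function here ... */
	}

	/* Optional hook for one-time CPU feature detection: */
	#define sha256_mod_init_arch sha256_mod_init_arch
	static inline void sha256_mod_init_arch(void)
	{
		/* e.g. enable a static key when the CPU feature is present */
	}
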
diff --git a/arch/mips/cavium-octeon/Kconfig b/arch/mips/cavium-octeon/Kconfig index 11f4aa6e80e9..450e979ef5d9 100644 --- a/arch/mips/cavium-octeon/Kconfig +++ b/arch/mips/cavium-octeon/Kconfig @@ -23,12 +23,6 @@ config CAVIUM_OCTEON_CVMSEG_SIZE legally range is from zero to 54 cache blocks (i.e. CVMSEG LM is between zero and 6192 bytes). -config CRYPTO_SHA256_OCTEON - tristate - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_LIB_SHA256_GENERIC - endif # CPU_CAVIUM_OCTEON if CAVIUM_OCTEON_SOC diff --git a/arch/mips/cavium-octeon/crypto/Makefile b/arch/mips/cavium-octeon/crypto/Makefile index 168b19ef7ce8..db428e4b30bc 100644 --- a/arch/mips/cavium-octeon/crypto/Makefile +++ b/arch/mips/cavium-octeon/crypto/Makefile @@ -7,4 +7,3 @@ obj-y += octeon-crypto.o obj-$(CONFIG_CRYPTO_MD5_OCTEON) += octeon-md5.o obj-$(CONFIG_CRYPTO_SHA1_OCTEON) += octeon-sha1.o -obj-$(CONFIG_CRYPTO_SHA256_OCTEON) += octeon-sha256.o diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha256.c b/arch/mips/cavium-octeon/crypto/octeon-sha256.c deleted file mode 100644 index c7c67bdc2bd0..000000000000 --- a/arch/mips/cavium-octeon/crypto/octeon-sha256.c +++ /dev/null @@ -1,66 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 Secure Hash Algorithm. - * - * Adapted for OCTEON by Aaro Koskinen . - * - * Based on crypto/sha256_generic.c, which is: - * - * Copyright (c) Jean-Luc Cooke - * Copyright (c) Andrew McDonald - * Copyright (c) 2002 James Morris - * SHA224 Support Copyright 2007 Intel Corporation - */ - -#include -#include -#include -#include -#include - -/* - * We pass everything as 64-bit. OCTEON can handle misaligned data. - */ - -void sha256_blocks_arch(struct sha256_block_state *state, - const u8 *data, size_t nblocks) -{ - struct octeon_cop2_state cop2_state; - u64 *state64 = (u64 *)state; - unsigned long flags; - - if (!octeon_has_crypto()) - return sha256_blocks_generic(state, data, nblocks); - - flags = octeon_crypto_enable(&cop2_state); - write_octeon_64bit_hash_dword(state64[0], 0); - write_octeon_64bit_hash_dword(state64[1], 1); - write_octeon_64bit_hash_dword(state64[2], 2); - write_octeon_64bit_hash_dword(state64[3], 3); - - do { - const u64 *block = (const u64 *)data; - - write_octeon_64bit_block_dword(block[0], 0); - write_octeon_64bit_block_dword(block[1], 1); - write_octeon_64bit_block_dword(block[2], 2); - write_octeon_64bit_block_dword(block[3], 3); - write_octeon_64bit_block_dword(block[4], 4); - write_octeon_64bit_block_dword(block[5], 5); - write_octeon_64bit_block_dword(block[6], 6); - octeon_sha256_start(block[7]); - - data += SHA256_BLOCK_SIZE; - } while (--nblocks); - - state64[0] = read_octeon_64bit_hash_dword(0); - state64[1] = read_octeon_64bit_hash_dword(1); - state64[2] = read_octeon_64bit_hash_dword(2); - state64[3] = read_octeon_64bit_hash_dword(3); - octeon_crypto_disable(&cop2_state, flags); -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm (OCTEON)"); -MODULE_AUTHOR("Aaro Koskinen "); diff --git a/include/crypto/internal/sha2.h b/include/crypto/internal/sha2.h deleted file mode 100644 index 7915a3a46bc8..000000000000 --- a/include/crypto/internal/sha2.h +++ /dev/null @@ -1,52 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ - -#ifndef _CRYPTO_INTERNAL_SHA2_H -#define _CRYPTO_INTERNAL_SHA2_H - -#include -#include -#include -#include -#include - -void sha256_blocks_generic(struct sha256_block_state *state, - const u8 *data, size_t nblocks); -void 
sha256_blocks_arch(struct sha256_block_state *state, - const u8 *data, size_t nblocks); - -static __always_inline void sha256_choose_blocks( - u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks, - bool force_generic, bool force_simd) -{ - if (!IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_SHA256) || force_generic) - sha256_blocks_generic((struct sha256_block_state *)state, data, nblocks); - else - sha256_blocks_arch((struct sha256_block_state *)state, data, nblocks); -} - -static __always_inline void sha256_finup( - struct crypto_sha256_state *sctx, u8 buf[SHA256_BLOCK_SIZE], - size_t len, u8 out[SHA256_DIGEST_SIZE], size_t digest_size, - bool force_generic, bool force_simd) -{ - const size_t bit_offset = SHA256_BLOCK_SIZE - 8; - __be64 *bits = (__be64 *)&buf[bit_offset]; - int i; - - buf[len++] = 0x80; - if (len > bit_offset) { - memset(&buf[len], 0, SHA256_BLOCK_SIZE - len); - sha256_choose_blocks(sctx->state, buf, 1, force_generic, - force_simd); - len = 0; - } - - memset(&buf[len], 0, bit_offset - len); - *bits = cpu_to_be64(sctx->count << 3); - sha256_choose_blocks(sctx->state, buf, 1, force_generic, force_simd); - - for (i = 0; i < digest_size; i += 4) - put_unaligned_be32(sctx->state[i / 4], out + i); -} - -#endif /* _CRYPTO_INTERNAL_SHA2_H */ diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 9bd740475a89..3305c6908581 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -144,20 +144,17 @@ config CRYPTO_LIB_SHA256 by either the generic implementation or an arch-specific one, if one is available and enabled. -config CRYPTO_ARCH_HAVE_LIB_SHA256 +config CRYPTO_LIB_SHA256_ARCH bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the SHA-256 library interface. - -config CRYPTO_LIB_SHA256_GENERIC - tristate - default CRYPTO_LIB_SHA256 if !CRYPTO_ARCH_HAVE_LIB_SHA256 - help - This symbol can be selected by arch implementations of the SHA-256 - library interface that require the generic code as a fallback, e.g., - for SIMD implementations. If no arch specific implementation is - enabled, this implementation serves the users of CRYPTO_LIB_SHA256. 
+ depends on CRYPTO_LIB_SHA256 && !UML + default y if ARM && !CPU_V7M + default y if ARM64 + default y if MIPS && CPU_CAVIUM_OCTEON + default y if PPC && SPE + default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if S390 + default y if SPARC64 + default y if X86_64 config CRYPTO_LIB_SHA512 tristate @@ -199,9 +196,6 @@ endif if S390 source "lib/crypto/s390/Kconfig" endif -if SPARC -source "lib/crypto/sparc/Kconfig" -endif if X86 source "lib/crypto/x86/Kconfig" endif diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 5823137fa5a8..a887bf103bf0 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -66,11 +66,39 @@ libpoly1305-generic-y += poly1305-generic.o obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o libsha1-y := sha1.o -obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o -libsha256-y := sha256.o +################################################################################ -obj-$(CONFIG_CRYPTO_LIB_SHA256_GENERIC) += libsha256-generic.o -libsha256-generic-y := sha256-generic.o +obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o +libsha256-y := sha256.o +ifeq ($(CONFIG_CRYPTO_LIB_SHA256_ARCH),y) +CFLAGS_sha256.o += -I$(src)/$(SRCARCH) + +ifeq ($(CONFIG_ARM),y) +libsha256-y += arm/sha256-ce.o arm/sha256-core.o +$(obj)/arm/sha256-core.S: $(src)/arm/sha256-armv4.pl + $(call cmd,perlasm) +clean-files += arm/sha256-core.S +AFLAGS_arm/sha256-core.o += $(aflags-thumb2-y) +endif + +ifeq ($(CONFIG_ARM64),y) +libsha256-y += arm64/sha256-core.o +$(obj)/arm64/sha256-core.S: $(src)/arm64/sha2-armv8.pl + $(call cmd,perlasm_with_args) +clean-files += arm64/sha256-core.S +libsha256-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha256-ce.o +endif + +libsha256-$(CONFIG_PPC) += powerpc/sha256-spe-asm.o +libsha256-$(CONFIG_RISCV) += riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.o +libsha256-$(CONFIG_SPARC) += sparc/sha256_asm.o +libsha256-$(CONFIG_X86) += x86/sha256-ssse3-asm.o \ + x86/sha256-avx-asm.o \ + x86/sha256-avx2-asm.o \ + x86/sha256-ni-asm.o +endif # CONFIG_CRYPTO_LIB_SHA256_ARCH + +################################################################################ obj-$(CONFIG_CRYPTO_LIB_SHA512) += libsha512.o libsha512-y := sha512.o @@ -100,6 +128,8 @@ libsha512-$(CONFIG_X86) += x86/sha512-ssse3-asm.o \ x86/sha512-avx2-asm.o endif # CONFIG_CRYPTO_LIB_SHA512_ARCH +################################################################################ + obj-$(CONFIG_MPILIB) += mpi/ obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o @@ -113,5 +143,4 @@ obj-$(CONFIG_MIPS) += mips/ obj-$(CONFIG_PPC) += powerpc/ obj-$(CONFIG_RISCV) += riscv/ obj-$(CONFIG_S390) += s390/ -obj-$(CONFIG_SPARC) += sparc/ obj-$(CONFIG_X86) += x86/ diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig index 9f3ff30f4032..e8444fd0aae3 100644 --- a/lib/crypto/arm/Kconfig +++ b/lib/crypto/arm/Kconfig @@ -22,9 +22,3 @@ config CRYPTO_POLY1305_ARM tristate default CRYPTO_LIB_POLY1305 select CRYPTO_ARCH_HAVE_LIB_POLY1305 - -config CRYPTO_SHA256_ARM - tristate - depends on !CPU_V7M - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile index 431f77c3ff6f..4c042a4c77ed 100644 --- a/lib/crypto/arm/Makefile +++ b/lib/crypto/arm/Makefile @@ -10,17 +10,13 @@ chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o poly1305-arm-y := poly1305-core.o poly1305-glue.o -obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o -sha256-arm-y := sha256.o sha256-core.o -sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += 
sha256-ce.o - quiet_cmd_perl = PERL $@ cmd_perl = $(PERL) $(<) > $(@) $(obj)/%-core.S: $(src)/%-armv4.pl $(call cmd,perl) -clean-files += poly1305-core.S sha256-core.S +clean-files += poly1305-core.S aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 @@ -28,5 +24,3 @@ aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) - -AFLAGS_sha256-core.o += $(aflags-thumb2-y) diff --git a/lib/crypto/arm/sha256.c b/lib/crypto/arm/sha256.c deleted file mode 100644 index 27181be0aa92..000000000000 --- a/lib/crypto/arm/sha256.c +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 optimized for ARM - * - * Copyright 2025 Google LLC - */ -#include -#include -#include -#include -#include - -asmlinkage void sha256_block_data_order(struct sha256_block_state *state, - const u8 *data, size_t nblocks); -asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state, - const u8 *data, size_t nblocks); -asmlinkage void sha256_ce_transform(struct sha256_block_state *state, - const u8 *data, size_t nblocks); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); - -void sha256_blocks_arch(struct sha256_block_state *state, - const u8 *data, size_t nblocks) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon) && crypto_simd_usable()) { - kernel_neon_begin(); - if (static_branch_likely(&have_ce)) - sha256_ce_transform(state, data, nblocks); - else - sha256_block_data_order_neon(state, data, nblocks); - kernel_neon_end(); - } else { - sha256_block_data_order(state, data, nblocks); - } -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -static int __init sha256_arm_mod_init(void) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { - static_branch_enable(&have_neon); - if (elf_hwcap2 & HWCAP2_SHA2) - static_branch_enable(&have_ce); - } - return 0; -} -subsys_initcall(sha256_arm_mod_init); - -static void __exit sha256_arm_mod_exit(void) -{ -} -module_exit(sha256_arm_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 optimized for ARM"); diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h new file mode 100644 index 000000000000..da75cbdc51d4 --- /dev/null +++ b/lib/crypto/arm/sha256.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized for ARM + * + * Copyright 2025 Google LLC + */ +#include +#include + +asmlinkage void sha256_block_data_order(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_ce_transform(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && crypto_simd_usable()) { + kernel_neon_begin(); + if (static_branch_likely(&have_ce)) + sha256_ce_transform(state, data, nblocks); + else + sha256_block_data_order_neon(state, data, nblocks); + 
kernel_neon_end(); + } else { + sha256_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + if (elf_hwcap2 & HWCAP2_SHA2) + static_branch_enable(&have_ce); + } +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig index 49e57bfdb5b5..0b903ef524d8 100644 --- a/lib/crypto/arm64/Kconfig +++ b/lib/crypto/arm64/Kconfig @@ -12,8 +12,3 @@ config CRYPTO_POLY1305_NEON depends on KERNEL_MODE_NEON default CRYPTO_LIB_POLY1305 select CRYPTO_ARCH_HAVE_LIB_POLY1305 - -config CRYPTO_SHA256_ARM64 - tristate - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile index 946c09903711..6207088397a7 100644 --- a/lib/crypto/arm64/Makefile +++ b/lib/crypto/arm64/Makefile @@ -8,17 +8,10 @@ poly1305-neon-y := poly1305-core.o poly1305-glue.o AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch -obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o -sha256-arm64-y := sha256.o sha256-core.o -sha256-arm64-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o - quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $(<) void $(@) $(obj)/%-core.S: $(src)/%-armv8.pl $(call cmd,perlasm) -$(obj)/sha256-core.S: $(src)/sha2-armv8.pl - $(call cmd,perlasm) - -clean-files += poly1305-core.S sha256-core.S +clean-files += poly1305-core.S diff --git a/lib/crypto/arm64/sha256.c b/lib/crypto/arm64/sha256.c deleted file mode 100644 index a5a498276708..000000000000 --- a/lib/crypto/arm64/sha256.c +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 optimized for ARM64 - * - * Copyright 2025 Google LLC - */ -#include -#include -#include -#include -#include - -asmlinkage void sha256_block_data_order(struct sha256_block_state *state, - const u8 *data, size_t nblocks); -asmlinkage void sha256_block_neon(struct sha256_block_state *state, - const u8 *data, size_t nblocks); -asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state, - const u8 *data, size_t nblocks); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); - -void sha256_blocks_arch(struct sha256_block_state *state, - const u8 *data, size_t nblocks) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon) && crypto_simd_usable()) { - if (static_branch_likely(&have_ce)) { - do { - size_t rem; - - kernel_neon_begin(); - rem = __sha256_ce_transform(state, - data, nblocks); - kernel_neon_end(); - data += (nblocks - rem) * SHA256_BLOCK_SIZE; - nblocks = rem; - } while (nblocks); - } else { - kernel_neon_begin(); - sha256_block_neon(state, data, nblocks); - kernel_neon_end(); - } - } else { - sha256_block_data_order(state, data, nblocks); - } -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -static int __init sha256_arm64_mod_init(void) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - cpu_have_named_feature(ASIMD)) { - static_branch_enable(&have_neon); - if (cpu_have_named_feature(SHA2)) - static_branch_enable(&have_ce); - } - return 0; -} -subsys_initcall(sha256_arm64_mod_init); - -static void __exit sha256_arm64_mod_exit(void) -{ -} -module_exit(sha256_arm64_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 optimized for ARM64"); diff --git 
a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h new file mode 100644 index 000000000000..a211966c124a --- /dev/null +++ b/lib/crypto/arm64/sha256.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized for ARM64 + * + * Copyright 2025 Google LLC + */ +#include +#include +#include + +asmlinkage void sha256_block_data_order(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_block_neon(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && crypto_simd_usable()) { + if (static_branch_likely(&have_ce)) { + do { + size_t rem; + + kernel_neon_begin(); + rem = __sha256_ce_transform(state, + data, nblocks); + kernel_neon_end(); + data += (nblocks - rem) * SHA256_BLOCK_SIZE; + nblocks = rem; + } while (nblocks); + } else { + kernel_neon_begin(); + sha256_block_neon(state, data, nblocks); + kernel_neon_end(); + } + } else { + sha256_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (cpu_have_named_feature(ASIMD)) { + static_branch_enable(&have_neon); + if (cpu_have_named_feature(SHA2)) + static_branch_enable(&have_ce); + } +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/mips/sha256.h b/lib/crypto/mips/sha256.h new file mode 100644 index 000000000000..ccccfd131634 --- /dev/null +++ b/lib/crypto/mips/sha256.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 Secure Hash Algorithm. + * + * Adapted for OCTEON by Aaro Koskinen . + * + * Based on crypto/sha256_generic.c, which is: + * + * Copyright (c) Jean-Luc Cooke + * Copyright (c) Andrew McDonald + * Copyright (c) 2002 James Morris + * SHA224 Support Copyright 2007 Intel Corporation + */ + +#include +#include + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. 
+ */ + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + struct octeon_cop2_state cop2_state; + u64 *state64 = (u64 *)state; + unsigned long flags; + + if (!octeon_has_crypto()) + return sha256_blocks_generic(state, data, nblocks); + + flags = octeon_crypto_enable(&cop2_state); + write_octeon_64bit_hash_dword(state64[0], 0); + write_octeon_64bit_hash_dword(state64[1], 1); + write_octeon_64bit_hash_dword(state64[2], 2); + write_octeon_64bit_hash_dword(state64[3], 3); + + do { + const u64 *block = (const u64 *)data; + + write_octeon_64bit_block_dword(block[0], 0); + write_octeon_64bit_block_dword(block[1], 1); + write_octeon_64bit_block_dword(block[2], 2); + write_octeon_64bit_block_dword(block[3], 3); + write_octeon_64bit_block_dword(block[4], 4); + write_octeon_64bit_block_dword(block[5], 5); + write_octeon_64bit_block_dword(block[6], 6); + octeon_sha256_start(block[7]); + + data += SHA256_BLOCK_SIZE; + } while (--nblocks); + + state64[0] = read_octeon_64bit_hash_dword(0); + state64[1] = read_octeon_64bit_hash_dword(1); + state64[2] = read_octeon_64bit_hash_dword(2); + state64[3] = read_octeon_64bit_hash_dword(3); + octeon_crypto_disable(&cop2_state, flags); +} diff --git a/lib/crypto/powerpc/Kconfig b/lib/crypto/powerpc/Kconfig index 3f9e1bbd9905..2eaeb7665a6a 100644 --- a/lib/crypto/powerpc/Kconfig +++ b/lib/crypto/powerpc/Kconfig @@ -14,9 +14,3 @@ config CRYPTO_POLY1305_P10 default CRYPTO_LIB_POLY1305 select CRYPTO_ARCH_HAVE_LIB_POLY1305 select CRYPTO_LIB_POLY1305_GENERIC - -config CRYPTO_SHA256_PPC_SPE - tristate - depends on SPE - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 diff --git a/lib/crypto/powerpc/Makefile b/lib/crypto/powerpc/Makefile index 27f231f8e334..5709ae14258a 100644 --- a/lib/crypto/powerpc/Makefile +++ b/lib/crypto/powerpc/Makefile @@ -5,6 +5,3 @@ chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o - -obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o -sha256-ppc-spe-y := sha256.o sha256-spe-asm.o diff --git a/lib/crypto/powerpc/sha256.c b/lib/crypto/powerpc/sha256.c deleted file mode 100644 index 14b8adcdcfc2..000000000000 --- a/lib/crypto/powerpc/sha256.c +++ /dev/null @@ -1,65 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 Secure Hash Algorithm, SPE optimized - * - * Based on generic implementation. The assembler module takes care - * about the SPE registers so it can run from interrupt context. - * - * Copyright (c) 2015 Markus Stockhausen - */ - -#include -#include -#include -#include -#include - -/* - * MAX_BYTES defines the number of bytes that are allowed to be processed - * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000 - * operations per 64 bytes. e500 cores can issue two arithmetic instructions - * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). - * Thus 1KB of input data will need an estimated maximum of 18,000 cycles. - * Headroom for cache misses included. Even with the low end model clocked - * at 667 MHz this equals to a critical time window of less than 27us. - * - */ -#define MAX_BYTES 1024 - -extern void ppc_spe_sha256_transform(struct sha256_block_state *state, - const u8 *src, u32 blocks); - -static void spe_begin(void) -{ - /* We just start SPE operations and will save SPE registers later. 
*/ - preempt_disable(); - enable_kernel_spe(); -} - -static void spe_end(void) -{ - disable_kernel_spe(); - /* reenable preemption */ - preempt_enable(); -} - -void sha256_blocks_arch(struct sha256_block_state *state, - const u8 *data, size_t nblocks) -{ - do { - /* cut input data into smaller blocks */ - u32 unit = min_t(size_t, nblocks, - MAX_BYTES / SHA256_BLOCK_SIZE); - - spe_begin(); - ppc_spe_sha256_transform(state, data, unit); - spe_end(); - - data += unit * SHA256_BLOCK_SIZE; - nblocks -= unit; - } while (nblocks); -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm, SPE optimized"); diff --git a/lib/crypto/powerpc/sha256.h b/lib/crypto/powerpc/sha256.h new file mode 100644 index 000000000000..50d355441c7e --- /dev/null +++ b/lib/crypto/powerpc/sha256.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 Secure Hash Algorithm, SPE optimized + * + * Based on generic implementation. The assembler module takes care + * about the SPE registers so it can run from interrupt context. + * + * Copyright (c) 2015 Markus Stockhausen + */ + +#include +#include + +/* + * MAX_BYTES defines the number of bytes that are allowed to be processed + * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000 + * operations per 64 bytes. e500 cores can issue two arithmetic instructions + * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). + * Thus 1KB of input data will need an estimated maximum of 18,000 cycles. + * Headroom for cache misses included. Even with the low end model clocked + * at 667 MHz this equals to a critical time window of less than 27us. + * + */ +#define MAX_BYTES 1024 + +extern void ppc_spe_sha256_transform(struct sha256_block_state *state, + const u8 *src, u32 blocks); + +static void spe_begin(void) +{ + /* We just start SPE operations and will save SPE registers later. 
*/ + preempt_disable(); + enable_kernel_spe(); +} + +static void spe_end(void) +{ + disable_kernel_spe(); + /* reenable preemption */ + preempt_enable(); +} + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + /* cut input data into smaller blocks */ + u32 unit = min_t(size_t, nblocks, + MAX_BYTES / SHA256_BLOCK_SIZE); + + spe_begin(); + ppc_spe_sha256_transform(state, data, unit); + spe_end(); + + data += unit * SHA256_BLOCK_SIZE; + nblocks -= unit; + } while (nblocks); +} diff --git a/lib/crypto/riscv/Kconfig b/lib/crypto/riscv/Kconfig index c100571feb7e..bc7a43f33eb3 100644 --- a/lib/crypto/riscv/Kconfig +++ b/lib/crypto/riscv/Kconfig @@ -6,10 +6,3 @@ config CRYPTO_CHACHA_RISCV64 default CRYPTO_LIB_CHACHA select CRYPTO_ARCH_HAVE_LIB_CHACHA select CRYPTO_LIB_CHACHA_GENERIC - -config CRYPTO_SHA256_RISCV64 - tristate - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_LIB_SHA256_GENERIC diff --git a/lib/crypto/riscv/Makefile b/lib/crypto/riscv/Makefile index b7cb877a2c07..e27b78f317fc 100644 --- a/lib/crypto/riscv/Makefile +++ b/lib/crypto/riscv/Makefile @@ -2,6 +2,3 @@ obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o - -obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o -sha256-riscv64-y := sha256.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o diff --git a/lib/crypto/riscv/sha256.c b/lib/crypto/riscv/sha256.c deleted file mode 100644 index 01004cb9c6e9..000000000000 --- a/lib/crypto/riscv/sha256.c +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 (RISC-V accelerated) - * - * Copyright (C) 2022 VRULL GmbH - * Author: Heiko Stuebner - * - * Copyright (C) 2023 SiFive, Inc. - * Author: Jerry Shih - */ - -#include -#include -#include -#include -#include - -asmlinkage void -sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state, - const u8 *data, size_t nblocks); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); - -void sha256_blocks_arch(struct sha256_block_state *state, - const u8 *data, size_t nblocks) -{ - if (static_branch_likely(&have_extensions) && crypto_simd_usable()) { - kernel_vector_begin(); - sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks); - kernel_vector_end(); - } else { - sha256_blocks_generic(state, data, nblocks); - } -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -static int __init riscv64_sha256_mod_init(void) -{ - /* Both zvknha and zvknhb provide the SHA-256 instructions. */ - if ((riscv_isa_extension_available(NULL, ZVKNHA) || - riscv_isa_extension_available(NULL, ZVKNHB)) && - riscv_isa_extension_available(NULL, ZVKB) && - riscv_vector_vlen() >= 128) - static_branch_enable(&have_extensions); - return 0; -} -subsys_initcall(riscv64_sha256_mod_init); - -static void __exit riscv64_sha256_mod_exit(void) -{ -} -module_exit(riscv64_sha256_mod_exit); - -MODULE_DESCRIPTION("SHA-256 (RISC-V accelerated)"); -MODULE_AUTHOR("Heiko Stuebner "); -MODULE_LICENSE("GPL"); diff --git a/lib/crypto/riscv/sha256.h b/lib/crypto/riscv/sha256.h new file mode 100644 index 000000000000..c0f79c18f119 --- /dev/null +++ b/lib/crypto/riscv/sha256.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 (RISC-V accelerated) + * + * Copyright (C) 2022 VRULL GmbH + * Author: Heiko Stuebner + * + * Copyright (C) 2023 SiFive, Inc. 
+ * Author: Jerry Shih + */ + +#include +#include + +asmlinkage void +sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_extensions) && crypto_simd_usable()) { + kernel_vector_begin(); + sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks); + kernel_vector_end(); + } else { + sha256_blocks_generic(state, data, nblocks); + } +} + +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + /* Both zvknha and zvknhb provide the SHA-256 instructions. */ + if ((riscv_isa_extension_available(NULL, ZVKNHA) || + riscv_isa_extension_available(NULL, ZVKNHB)) && + riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + static_branch_enable(&have_extensions); +} diff --git a/lib/crypto/s390/Kconfig b/lib/crypto/s390/Kconfig index e3f855ef4393..069b355fe51a 100644 --- a/lib/crypto/s390/Kconfig +++ b/lib/crypto/s390/Kconfig @@ -5,9 +5,3 @@ config CRYPTO_CHACHA_S390 default CRYPTO_LIB_CHACHA select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_SHA256_S390 - tristate - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_LIB_SHA256_GENERIC diff --git a/lib/crypto/s390/Makefile b/lib/crypto/s390/Makefile index 5df30f1e7930..06c2cf77178e 100644 --- a/lib/crypto/s390/Makefile +++ b/lib/crypto/s390/Makefile @@ -2,6 +2,3 @@ obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o chacha_s390-y := chacha-glue.o chacha-s390.o - -obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256-s390.o -sha256-s390-y := sha256.o diff --git a/lib/crypto/s390/sha256.c b/lib/crypto/s390/sha256.c deleted file mode 100644 index 6ebfd35a5d44..000000000000 --- a/lib/crypto/s390/sha256.c +++ /dev/null @@ -1,41 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF) - * - * Copyright 2025 Google LLC - */ -#include -#include -#include -#include -#include - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256); - -void sha256_blocks_arch(struct sha256_block_state *state, - const u8 *data, size_t nblocks) -{ - if (static_branch_likely(&have_cpacf_sha256)) - cpacf_kimd(CPACF_KIMD_SHA_256, state, data, - nblocks * SHA256_BLOCK_SIZE); - else - sha256_blocks_generic(state, data, nblocks); -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -static int __init sha256_s390_mod_init(void) -{ - if (cpu_have_feature(S390_CPU_FEATURE_MSA) && - cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) - static_branch_enable(&have_cpacf_sha256); - return 0; -} -subsys_initcall(sha256_s390_mod_init); - -static void __exit sha256_s390_mod_exit(void) -{ -} -module_exit(sha256_s390_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 using the CP Assist for Cryptographic Functions (CPACF)"); diff --git a/lib/crypto/s390/sha256.h b/lib/crypto/s390/sha256.h new file mode 100644 index 000000000000..70a81cbc06b2 --- /dev/null +++ b/lib/crypto/s390/sha256.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 
*data, size_t nblocks) +{ + if (static_branch_likely(&have_cpacf_sha256)) + cpacf_kimd(CPACF_KIMD_SHA_256, state, data, + nblocks * SHA256_BLOCK_SIZE); + else + sha256_blocks_generic(state, data, nblocks); +} + +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (cpu_have_feature(S390_CPU_FEATURE_MSA) && + cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) + static_branch_enable(&have_cpacf_sha256); +} diff --git a/lib/crypto/sha256-generic.c b/lib/crypto/sha256-generic.c deleted file mode 100644 index 99f904033c26..000000000000 --- a/lib/crypto/sha256-generic.c +++ /dev/null @@ -1,150 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256, as specified in - * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf - * - * SHA-256 code by Jean-Luc Cooke . - * - * Copyright (c) Jean-Luc Cooke - * Copyright (c) Andrew McDonald - * Copyright (c) 2002 James Morris - * Copyright (c) 2014 Red Hat Inc. - */ - -#include -#include -#include -#include -#include -#include - -static const u32 SHA256_K[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, -}; - -static inline u32 Ch(u32 x, u32 y, u32 z) -{ - return z ^ (x & (y ^ z)); -} - -static inline u32 Maj(u32 x, u32 y, u32 z) -{ - return (x & y) | (z & (x | y)); -} - -#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) -#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) -#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) -#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) - -static inline void LOAD_OP(int I, u32 *W, const u8 *input) -{ - W[I] = get_unaligned_be32((__u32 *)input + I); -} - -static inline void BLEND_OP(int I, u32 *W) -{ - W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; -} - -#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) do { \ - u32 t1, t2; \ - t1 = h + e1(e) + Ch(e, f, g) + SHA256_K[i] + W[i]; \ - t2 = e0(a) + Maj(a, b, c); \ - d += t1; \ - h = t1 + t2; \ -} while (0) - -static void sha256_block_generic(struct sha256_block_state *state, - const u8 *input, u32 W[64]) -{ - u32 a, b, c, d, e, f, g, h; - int i; - - /* load the input */ - for (i = 0; i < 16; i += 8) { - LOAD_OP(i + 0, W, input); - LOAD_OP(i + 1, W, input); - LOAD_OP(i + 2, W, input); - LOAD_OP(i + 3, W, input); - LOAD_OP(i + 4, W, input); - LOAD_OP(i + 5, W, input); - LOAD_OP(i + 6, W, input); - LOAD_OP(i + 7, W, input); - } - - /* now blend */ - for (i = 16; i < 64; i += 8) { - BLEND_OP(i + 0, W); - BLEND_OP(i + 1, W); - BLEND_OP(i + 2, W); - BLEND_OP(i + 3, W); - BLEND_OP(i + 4, W); - BLEND_OP(i + 5, W); - BLEND_OP(i + 6, W); - BLEND_OP(i + 7, W); - } - - /* load the state into our registers */ - a = state->h[0]; - b = state->h[1]; - c = state->h[2]; - d = state->h[3]; - e = state->h[4]; - f = state->h[5]; - g = 
state->h[6]; - h = state->h[7]; - - /* now iterate */ - for (i = 0; i < 64; i += 8) { - SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h); - SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); - SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); - SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); - SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); - SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); - SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); - SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); - } - - state->h[0] += a; - state->h[1] += b; - state->h[2] += c; - state->h[3] += d; - state->h[4] += e; - state->h[5] += f; - state->h[6] += g; - state->h[7] += h; -} - -void sha256_blocks_generic(struct sha256_block_state *state, - const u8 *data, size_t nblocks) -{ - u32 W[64]; - - do { - sha256_block_generic(state, data, W); - data += SHA256_BLOCK_SIZE; - } while (--nblocks); - - memzero_explicit(W, sizeof(W)); -} -EXPORT_SYMBOL_GPL(sha256_blocks_generic); - -MODULE_DESCRIPTION("SHA-256 Algorithm (generic implementation)"); -MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 12b4b59052c4..68936d5cd774 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -6,15 +6,17 @@ * Copyright (c) Andrew McDonald * Copyright (c) 2002 James Morris * Copyright (c) 2014 Red Hat Inc. + * Copyright 2025 Google LLC */ #include #include -#include +#include #include #include #include #include +#include #include static const struct sha256_block_state sha224_iv = { @@ -31,26 +33,128 @@ static const struct sha256_block_state sha256_iv = { }, }; -/* - * If __DISABLE_EXPORTS is defined, then this file is being compiled for a - * pre-boot environment. In that case, ignore the kconfig options, pull the - * generic code into the same translation unit, and use that only. - */ -#ifdef __DISABLE_EXPORTS -#include "sha256-generic.c" -#endif +static const u32 sha256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, + 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, + 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, + 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, + 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, + 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y)))) +#define e0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22)) +#define e1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25)) +#define s0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3)) +#define s1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10)) + +static inline void LOAD_OP(int I, u32 *W, const u8 *input) +{ + W[I] = get_unaligned_be32((__u32 *)input + I); +} + +static inline void BLEND_OP(int I, u32 *W) +{ + W[I] = s1(W[I - 2]) + W[I - 7] + s0(W[I - 15]) + W[I - 16]; +} -static inline bool sha256_purgatory(void) +#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \ + do { \ + u32 t1, t2; \ + t1 = h + e1(e) + Ch(e, f, g) + sha256_K[i] + W[i]; \ + t2 = e0(a) + Maj(a, b, c); \ + d += t1; \ + h = t1 + t2; \ + } while (0) + +static void sha256_block_generic(struct 
sha256_block_state *state, + const u8 *input, u32 W[64]) { - return __is_defined(__DISABLE_EXPORTS); + u32 a, b, c, d, e, f, g, h; + int i; + + /* load the input */ + for (i = 0; i < 16; i += 8) { + LOAD_OP(i + 0, W, input); + LOAD_OP(i + 1, W, input); + LOAD_OP(i + 2, W, input); + LOAD_OP(i + 3, W, input); + LOAD_OP(i + 4, W, input); + LOAD_OP(i + 5, W, input); + LOAD_OP(i + 6, W, input); + LOAD_OP(i + 7, W, input); + } + + /* now blend */ + for (i = 16; i < 64; i += 8) { + BLEND_OP(i + 0, W); + BLEND_OP(i + 1, W); + BLEND_OP(i + 2, W); + BLEND_OP(i + 3, W); + BLEND_OP(i + 4, W); + BLEND_OP(i + 5, W); + BLEND_OP(i + 6, W); + BLEND_OP(i + 7, W); + } + + /* load the state into our registers */ + a = state->h[0]; + b = state->h[1]; + c = state->h[2]; + d = state->h[3]; + e = state->h[4]; + f = state->h[5]; + g = state->h[6]; + h = state->h[7]; + + /* now iterate */ + for (i = 0; i < 64; i += 8) { + SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h); + SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); + SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); + SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); + SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); + SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); + SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); + SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); + } + + state->h[0] += a; + state->h[1] += b; + state->h[2] += c; + state->h[3] += d; + state->h[4] += e; + state->h[5] += f; + state->h[6] += g; + state->h[7] += h; } -static inline void sha256_blocks(struct sha256_block_state *state, - const u8 *data, size_t nblocks) +static void __maybe_unused +sha256_blocks_generic(struct sha256_block_state *state, + const u8 *data, size_t nblocks) { - sha256_choose_blocks(state->h, data, nblocks, sha256_purgatory(), false); + u32 W[64]; + + do { + sha256_block_generic(state, data, W); + data += SHA256_BLOCK_SIZE; + } while (--nblocks); + + memzero_explicit(W, sizeof(W)); } +#if defined(CONFIG_CRYPTO_LIB_SHA256_ARCH) && !defined(__DISABLE_EXPORTS) +#include "sha256.h" /* $(SRCARCH)/sha256.h */ +#else +#define sha256_blocks sha256_blocks_generic +#endif + static void __sha256_init(struct __sha256_ctx *ctx, const struct sha256_block_state *iv, u64 initial_bytecount) @@ -273,5 +377,19 @@ void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len, EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey); #endif /* !__DISABLE_EXPORTS */ +#ifdef sha256_mod_init_arch +static int __init sha256_mod_init(void) +{ + sha256_mod_init_arch(); + return 0; +} +subsys_initcall(sha256_mod_init); + +static void __exit sha256_mod_exit(void) +{ +} +module_exit(sha256_mod_exit); +#endif + MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sparc/Kconfig b/lib/crypto/sparc/Kconfig deleted file mode 100644 index e5c3e4d3dba6..000000000000 --- a/lib/crypto/sparc/Kconfig +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_SHA256_SPARC64 - tristate - depends on SPARC64 - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_LIB_SHA256_GENERIC diff --git a/lib/crypto/sparc/Makefile b/lib/crypto/sparc/Makefile deleted file mode 100644 index 75ee244ad6f7..000000000000 --- a/lib/crypto/sparc/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_SHA256_SPARC64) += sha256-sparc64.o -sha256-sparc64-y := sha256.o sha256_asm.o diff --git a/lib/crypto/sparc/sha256.c b/lib/crypto/sparc/sha256.c deleted file mode 100644 index 
f41c109c1c18..000000000000 --- a/lib/crypto/sparc/sha256.c +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * SHA-256 accelerated using the sparc64 sha256 opcodes - * - * Copyright (c) Jean-Luc Cooke - * Copyright (c) Andrew McDonald - * Copyright (c) 2002 James Morris - * SHA224 Support Copyright 2007 Intel Corporation - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes); - -asmlinkage void sha256_sparc64_transform(struct sha256_block_state *state, - const u8 *data, size_t nblocks); - -void sha256_blocks_arch(struct sha256_block_state *state, - const u8 *data, size_t nblocks) -{ - if (static_branch_likely(&have_sha256_opcodes)) - sha256_sparc64_transform(state, data, nblocks); - else - sha256_blocks_generic(state, data, nblocks); -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -static int __init sha256_sparc64_mod_init(void) -{ - unsigned long cfr; - - if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) - return 0; - - __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); - if (!(cfr & CFR_SHA256)) - return 0; - - static_branch_enable(&have_sha256_opcodes); - pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n"); - return 0; -} -subsys_initcall(sha256_sparc64_mod_init); - -static void __exit sha256_sparc64_mod_exit(void) -{ -} -module_exit(sha256_sparc64_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 accelerated using the sparc64 sha256 opcodes"); diff --git a/lib/crypto/sparc/sha256.h b/lib/crypto/sparc/sha256.h new file mode 100644 index 000000000000..1d10108eb195 --- /dev/null +++ b/lib/crypto/sparc/sha256.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SHA-256 accelerated using the sparc64 sha256 opcodes + * + * Copyright (c) Jean-Luc Cooke + * Copyright (c) Andrew McDonald + * Copyright (c) 2002 James Morris + * SHA224 Support Copyright 2007 Intel Corporation + */ + +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes); + +asmlinkage void sha256_sparc64_transform(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha256_opcodes)) + sha256_sparc64_transform(state, data, nblocks); + else + sha256_blocks_generic(state, data, nblocks); +} + +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA256)) + return; + + static_branch_enable(&have_sha256_opcodes); + pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n"); +} diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig index e344579db3d8..546fe2afe0b5 100644 --- a/lib/crypto/x86/Kconfig +++ b/lib/crypto/x86/Kconfig @@ -24,10 +24,3 @@ config CRYPTO_POLY1305_X86_64 depends on 64BIT default CRYPTO_LIB_POLY1305 select CRYPTO_ARCH_HAVE_LIB_POLY1305 - -config CRYPTO_SHA256_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_LIB_SHA256_GENERIC diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile index abceca3d31c0..c2ff8c5f1046 100644 --- a/lib/crypto/x86/Makefile +++ b/lib/crypto/x86/Makefile @@ -10,9 +10,6 @@ 
obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o targets += poly1305-x86_64-cryptogams.S -obj-$(CONFIG_CRYPTO_SHA256_X86_64) += sha256-x86_64.o -sha256-x86_64-y := sha256.o sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256-ni-asm.o - quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $< > $@ diff --git a/lib/crypto/x86/sha256.c b/lib/crypto/x86/sha256.c deleted file mode 100644 index 9ee38d2b3d57..000000000000 --- a/lib/crypto/x86/sha256.c +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 optimized for x86_64 - * - * Copyright 2025 Google LLC - */ -#include -#include -#include -#include -#include -#include - -asmlinkage void sha256_transform_ssse3(struct sha256_block_state *state, - const u8 *data, size_t nblocks); -asmlinkage void sha256_transform_avx(struct sha256_block_state *state, - const u8 *data, size_t nblocks); -asmlinkage void sha256_transform_rorx(struct sha256_block_state *state, - const u8 *data, size_t nblocks); -asmlinkage void sha256_ni_transform(struct sha256_block_state *state, - const u8 *data, size_t nblocks); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86); - -DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3); - -void sha256_blocks_arch(struct sha256_block_state *state, - const u8 *data, size_t nblocks) -{ - if (static_branch_likely(&have_sha256_x86) && crypto_simd_usable()) { - kernel_fpu_begin(); - static_call(sha256_blocks_x86)(state, data, nblocks); - kernel_fpu_end(); - } else { - sha256_blocks_generic(state, data, nblocks); - } -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -static int __init sha256_x86_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) { - static_call_update(sha256_blocks_x86, sha256_ni_transform); - } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | - XFEATURE_MASK_YMM, NULL) && - boot_cpu_has(X86_FEATURE_AVX)) { - if (boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_BMI2)) - static_call_update(sha256_blocks_x86, - sha256_transform_rorx); - else - static_call_update(sha256_blocks_x86, - sha256_transform_avx); - } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) { - return 0; - } - static_branch_enable(&have_sha256_x86); - return 0; -} -subsys_initcall(sha256_x86_mod_init); - -static void __exit sha256_x86_mod_exit(void) -{ -} -module_exit(sha256_x86_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 optimized for x86_64"); diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h new file mode 100644 index 000000000000..3b5456c222ba --- /dev/null +++ b/lib/crypto/x86/sha256.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized for x86_64 + * + * Copyright 2025 Google LLC + */ +#include +#include +#include + +asmlinkage void sha256_transform_ssse3(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_transform_avx(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_transform_rorx(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_ni_transform(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86); + +DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha256_x86) && 
crypto_simd_usable()) { + kernel_fpu_begin(); + static_call(sha256_blocks_x86)(state, data, nblocks); + kernel_fpu_end(); + } else { + sha256_blocks_generic(state, data, nblocks); + } +} + +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) { + static_call_update(sha256_blocks_x86, sha256_ni_transform); + } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha256_blocks_x86, + sha256_transform_rorx); + else + static_call_update(sha256_blocks_x86, + sha256_transform_avx); + } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) { + return; + } + static_branch_enable(&have_sha256_x86); +} -- cgit v1.2.3 From 773d2b99bb76131f65f98828221142ccd2cec856 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:44 -0700 Subject: lib/crypto: sha256: Sync sha256_update() with sha512_update() The BLOCK_HASH_UPDATE_BLOCKS macro is difficult to read. For now, let's just write the update explicitly in the straightforward way, mirroring sha512_update(). It's possible that we'll bring back a macro for this later, but it needs to be properly justified and hopefully a bit more readable. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-14-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/sha256.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 68936d5cd774..808438d4f427 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include @@ -180,8 +179,31 @@ void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len) size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE; ctx->bytecount += len; - BLOCK_HASH_UPDATE_BLOCKS(sha256_blocks, &ctx->state, data, len, - SHA256_BLOCK_SIZE, ctx->buf, partial); + + if (partial + len >= SHA256_BLOCK_SIZE) { + size_t nblocks; + + if (partial) { + size_t l = SHA256_BLOCK_SIZE - partial; + + memcpy(&ctx->buf[partial], data, l); + data += l; + len -= l; + + sha256_blocks(&ctx->state, ctx->buf, 1); + } + + nblocks = len / SHA256_BLOCK_SIZE; + len %= SHA256_BLOCK_SIZE; + + if (nblocks) { + sha256_blocks(&ctx->state, data, nblocks); + data += nblocks * SHA256_BLOCK_SIZE; + } + partial = 0; + } + if (len) + memcpy(&ctx->buf[partial], data, len); } EXPORT_SYMBOL(__sha256_update); -- cgit v1.2.3 From b34c9803aabd85189ffacc0d3cdb9ce4515c2b4d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:45 -0700 Subject: lib/crypto: sha256: Document the SHA-224 and SHA-256 API Add kerneldoc comments, consistent with the kerneldoc comments of the SHA-384 and SHA-512 API. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-15-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha2.h | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index 2e3fc2cf4aa0..e0a08f6addd0 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -155,13 +155,51 @@ struct sha224_ctx { struct __sha256_ctx ctx; }; +/** + * sha224_init() - Initialize a SHA-224 context for a new message + * @ctx: the context to initialize + * + * If you don't need incremental computation, consider sha224() instead. 
+ * + * Context: Any context. + */ void sha224_init(struct sha224_ctx *ctx); + +/** + * sha224_update() - Update a SHA-224 context with message data + * @ctx: the context to update; must have been initialized + * @data: the message data + * @len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ static inline void sha224_update(struct sha224_ctx *ctx, const u8 *data, size_t len) { __sha256_update(&ctx->ctx, data, len); } + +/** + * sha224_final() - Finish computing a SHA-224 message digest + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting SHA-224 message digest + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]); + +/** + * sha224() - Compute SHA-224 message digest in one shot + * @data: the message data + * @len: the data length in bytes + * @out: (output) the resulting SHA-224 message digest + * + * Context: Any context. + */ void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]); /** @@ -275,13 +313,51 @@ struct sha256_ctx { struct __sha256_ctx ctx; }; +/** + * sha256_init() - Initialize a SHA-256 context for a new message + * @ctx: the context to initialize + * + * If you don't need incremental computation, consider sha256() instead. + * + * Context: Any context. + */ void sha256_init(struct sha256_ctx *ctx); + +/** + * sha256_update() - Update a SHA-256 context with message data + * @ctx: the context to update; must have been initialized + * @data: the message data + * @len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ static inline void sha256_update(struct sha256_ctx *ctx, const u8 *data, size_t len) { __sha256_update(&ctx->ctx, data, len); } + +/** + * sha256_final() - Finish computing a SHA-256 message digest + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting SHA-256 message digest + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); + +/** + * sha256() - Compute SHA-256 message digest in one shot + * @data: the message data + * @len: the data length in bytes + * @out: (output) the resulting SHA-256 message digest + * + * Context: Any context. + */ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); /** -- cgit v1.2.3 From a8c60a9aca778d7fd22d6c9b1af702d6f952b87f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 3 Jul 2025 19:39:57 -0700 Subject: lib/crypto: x86/sha256: Move static_call above kernel-mode FPU section As I did for sha512_blocks(), reorganize x86's sha256_blocks() to be just a static_call. To achieve that, for each assembly function add a C function that handles the kernel-mode FPU section and fallback. While this increases total code size slightly, the amount of code actually executed on a given system does not increase, and it is slightly more efficient since it eliminates the extra static_key. It also makes the assembly functions be called with standard direct calls instead of static calls, eliminating the need for ANNOTATE_NOENDBR. 
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250704023958.73274-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/sha256-avx-asm.S | 3 --- lib/crypto/x86/sha256-avx2-asm.S | 3 --- lib/crypto/x86/sha256-ni-asm.S | 2 -- lib/crypto/x86/sha256-ssse3-asm.S | 3 --- lib/crypto/x86/sha256.h | 48 ++++++++++++++++++++------------------- 5 files changed, 25 insertions(+), 34 deletions(-) diff --git a/lib/crypto/x86/sha256-avx-asm.S b/lib/crypto/x86/sha256-avx-asm.S index 73bcff2b548f..798a7f07fa01 100644 --- a/lib/crypto/x86/sha256-avx-asm.S +++ b/lib/crypto/x86/sha256-avx-asm.S @@ -48,7 +48,6 @@ ######################################################################## #include -#include ## assume buffers not aligned #define VMOVDQ vmovdqu @@ -346,8 +345,6 @@ a = TMP_ ######################################################################## .text SYM_FUNC_START(sha256_transform_avx) - ANNOTATE_NOENDBR # since this is called only via static_call - pushq %rbx pushq %r12 pushq %r13 diff --git a/lib/crypto/x86/sha256-avx2-asm.S b/lib/crypto/x86/sha256-avx2-asm.S index 45787570387f..62a46993359e 100644 --- a/lib/crypto/x86/sha256-avx2-asm.S +++ b/lib/crypto/x86/sha256-avx2-asm.S @@ -49,7 +49,6 @@ ######################################################################## #include -#include ## assume buffers not aligned #define VMOVDQ vmovdqu @@ -523,8 +522,6 @@ STACK_SIZE = _CTX + _CTX_SIZE ######################################################################## .text SYM_FUNC_START(sha256_transform_rorx) - ANNOTATE_NOENDBR # since this is called only via static_call - pushq %rbx pushq %r12 pushq %r13 diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S index 4af7d22e29e4..9ebbacbb9c13 100644 --- a/lib/crypto/x86/sha256-ni-asm.S +++ b/lib/crypto/x86/sha256-ni-asm.S @@ -54,7 +54,6 @@ */ #include -#include #define STATE_PTR %rdi /* 1st arg */ #define DATA_PTR %rsi /* 2nd arg */ @@ -111,7 +110,6 @@ */ .text SYM_FUNC_START(sha256_ni_transform) - ANNOTATE_NOENDBR # since this is called only via static_call shl $6, NUM_BLKS /* convert to bytes */ jz .Ldone_hash diff --git a/lib/crypto/x86/sha256-ssse3-asm.S b/lib/crypto/x86/sha256-ssse3-asm.S index 407b30adcd37..820fc8bbc29f 100644 --- a/lib/crypto/x86/sha256-ssse3-asm.S +++ b/lib/crypto/x86/sha256-ssse3-asm.S @@ -47,7 +47,6 @@ ######################################################################## #include -#include ## assume buffers not aligned #define MOVDQ movdqu @@ -353,8 +352,6 @@ a = TMP_ ######################################################################## .text SYM_FUNC_START(sha256_transform_ssse3) - ANNOTATE_NOENDBR # since this is called only via static_call - pushq %rbx pushq %r12 pushq %r13 diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h index 3b5456c222ba..669bc06538b6 100644 --- a/lib/crypto/x86/sha256.h +++ b/lib/crypto/x86/sha256.h @@ -8,48 +8,50 @@ #include #include -asmlinkage void sha256_transform_ssse3(struct sha256_block_state *state, - const u8 *data, size_t nblocks); -asmlinkage void sha256_transform_avx(struct sha256_block_state *state, - const u8 *data, size_t nblocks); -asmlinkage void sha256_transform_rorx(struct sha256_block_state *state, - const u8 *data, size_t nblocks); -asmlinkage void sha256_ni_transform(struct sha256_block_state *state, - const u8 *data, size_t nblocks); +DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86); +#define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \ + asmlinkage 
void asm_fn(struct sha256_block_state *state, \ + const u8 *data, size_t nblocks); \ + static void c_fn(struct sha256_block_state *state, const u8 *data, \ + size_t nblocks) \ + { \ + if (likely(crypto_simd_usable())) { \ + kernel_fpu_begin(); \ + asm_fn(state, data, nblocks); \ + kernel_fpu_end(); \ + } else { \ + sha256_blocks_generic(state, data, nblocks); \ + } \ + } -DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3); +DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3); +DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx); +DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx); +DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform); static void sha256_blocks(struct sha256_block_state *state, const u8 *data, size_t nblocks) { - if (static_branch_likely(&have_sha256_x86) && crypto_simd_usable()) { - kernel_fpu_begin(); - static_call(sha256_blocks_x86)(state, data, nblocks); - kernel_fpu_end(); - } else { - sha256_blocks_generic(state, data, nblocks); - } + static_call(sha256_blocks_x86)(state, data, nblocks); } #define sha256_mod_init_arch sha256_mod_init_arch static inline void sha256_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { - static_call_update(sha256_blocks_x86, sha256_ni_transform); + static_call_update(sha256_blocks_x86, sha256_blocks_ni); } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) static_call_update(sha256_blocks_x86, - sha256_transform_rorx); + sha256_blocks_avx2); else static_call_update(sha256_blocks_x86, - sha256_transform_avx); - } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) { - return; + sha256_blocks_avx); + } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { + static_call_update(sha256_blocks_x86, sha256_blocks_ssse3); } - static_branch_enable(&have_sha256_x86); } -- cgit v1.2.3 From 57b15e9260a31438e91cf83dbfcb63333b24c684 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 3 Jul 2025 19:39:58 -0700 Subject: lib/crypto: x86/sha256: Remove unnecessary checks for nblocks==0 Since sha256_blocks() is called only with nblocks >= 1, remove unnecessary checks for nblocks == 0 from the x86 SHA-256 assembly code. 
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250704023958.73274-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/sha256-avx-asm.S | 3 --- lib/crypto/x86/sha256-avx2-asm.S | 1 - lib/crypto/x86/sha256-ni-asm.S | 3 --- lib/crypto/x86/sha256-ssse3-asm.S | 3 --- 4 files changed, 10 deletions(-) diff --git a/lib/crypto/x86/sha256-avx-asm.S b/lib/crypto/x86/sha256-avx-asm.S index 798a7f07fa01..c1aceb3ba3a3 100644 --- a/lib/crypto/x86/sha256-avx-asm.S +++ b/lib/crypto/x86/sha256-avx-asm.S @@ -357,7 +357,6 @@ SYM_FUNC_START(sha256_transform_avx) and $~15, %rsp # align stack pointer shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash add INP, NUM_BLKS # pointer to end of data mov NUM_BLKS, _INP_END(%rsp) @@ -446,8 +445,6 @@ SYM_FUNC_START(sha256_transform_avx) cmp _INP_END(%rsp), INP jne .Lloop0 -.Ldone_hash: - mov %rbp, %rsp popq %rbp popq %r15 diff --git a/lib/crypto/x86/sha256-avx2-asm.S b/lib/crypto/x86/sha256-avx2-asm.S index 62a46993359e..eb8836fb9695 100644 --- a/lib/crypto/x86/sha256-avx2-asm.S +++ b/lib/crypto/x86/sha256-avx2-asm.S @@ -535,7 +535,6 @@ SYM_FUNC_START(sha256_transform_rorx) and $-32, %rsp # align rsp to 32 byte boundary shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block mov NUM_BLKS, _INP_END(%rsp) diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S index 9ebbacbb9c13..4bd9490ffc66 100644 --- a/lib/crypto/x86/sha256-ni-asm.S +++ b/lib/crypto/x86/sha256-ni-asm.S @@ -112,7 +112,6 @@ SYM_FUNC_START(sha256_ni_transform) shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash add DATA_PTR, NUM_BLKS /* pointer to end of data */ /* @@ -163,8 +162,6 @@ SYM_FUNC_START(sha256_ni_transform) movdqu STATE1, 0*16(STATE_PTR) movdqu STATE0, 1*16(STATE_PTR) -.Ldone_hash: - RET SYM_FUNC_END(sha256_ni_transform) diff --git a/lib/crypto/x86/sha256-ssse3-asm.S b/lib/crypto/x86/sha256-ssse3-asm.S index 820fc8bbc29f..383b8eec7ebe 100644 --- a/lib/crypto/x86/sha256-ssse3-asm.S +++ b/lib/crypto/x86/sha256-ssse3-asm.S @@ -364,7 +364,6 @@ SYM_FUNC_START(sha256_transform_ssse3) and $~15, %rsp shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash add INP, NUM_BLKS mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data @@ -457,8 +456,6 @@ SYM_FUNC_START(sha256_transform_ssse3) cmp _INP_END(%rsp), INP jne .Lloop0 -.Ldone_hash: - mov %rbp, %rsp popq %rbp popq %r15 -- cgit v1.2.3 From aacb37f597d0b50800a233ab2c29c195a1595147 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 10:22:23 -0700 Subject: lib/crypto: hash_info: Move hash_info.c into lib/crypto/ crypto/hash_info.c just contains a couple of arrays that map HASH_ALGO_* algorithm IDs to properties of those algorithms. It is compiled only when CRYPTO_HASH_INFO=y, but currently CRYPTO_HASH_INFO depends on CRYPTO. Since this can be useful without the old-school crypto API, move it into lib/crypto/ so that it no longer depends on CRYPTO. This eliminates the need for FS_VERITY to select CRYPTO after it's been converted to use lib/crypto/. 
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630172224.46909-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Kconfig | 3 --- crypto/Makefile | 1 - crypto/hash_info.c | 63 -------------------------------------------------- lib/crypto/Kconfig | 3 +++ lib/crypto/Makefile | 2 ++ lib/crypto/hash_info.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 68 insertions(+), 67 deletions(-) delete mode 100644 crypto/hash_info.c create mode 100644 lib/crypto/hash_info.c diff --git a/crypto/Kconfig b/crypto/Kconfig index 3ea1397214e0..5d4cf022c577 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1422,9 +1422,6 @@ config CRYPTO_USER_API_ENABLE_OBSOLETE endmenu -config CRYPTO_HASH_INFO - bool - if !KMSAN # avoid false positives from assembly if ARM source "arch/arm/crypto/Kconfig" diff --git a/crypto/Makefile b/crypto/Makefile index 5098fa6d5f39..816607e0e78c 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -204,7 +204,6 @@ obj-$(CONFIG_CRYPTO_ECRDSA) += ecrdsa_generic.o obj-$(CONFIG_XOR_BLOCKS) += xor.o obj-$(CONFIG_ASYNC_CORE) += async_tx/ obj-$(CONFIG_ASYMMETRIC_KEY_TYPE) += asymmetric_keys/ -obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o crypto_simd-y := simd.o obj-$(CONFIG_CRYPTO_SIMD) += crypto_simd.o diff --git a/crypto/hash_info.c b/crypto/hash_info.c deleted file mode 100644 index 9a467638c971..000000000000 --- a/crypto/hash_info.c +++ /dev/null @@ -1,63 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Hash Info: Hash algorithms information - * - * Copyright (c) 2013 Dmitry Kasatkin - */ - -#include -#include - -const char *const hash_algo_name[HASH_ALGO__LAST] = { - [HASH_ALGO_MD4] = "md4", - [HASH_ALGO_MD5] = "md5", - [HASH_ALGO_SHA1] = "sha1", - [HASH_ALGO_RIPE_MD_160] = "rmd160", - [HASH_ALGO_SHA256] = "sha256", - [HASH_ALGO_SHA384] = "sha384", - [HASH_ALGO_SHA512] = "sha512", - [HASH_ALGO_SHA224] = "sha224", - [HASH_ALGO_RIPE_MD_128] = "rmd128", - [HASH_ALGO_RIPE_MD_256] = "rmd256", - [HASH_ALGO_RIPE_MD_320] = "rmd320", - [HASH_ALGO_WP_256] = "wp256", - [HASH_ALGO_WP_384] = "wp384", - [HASH_ALGO_WP_512] = "wp512", - [HASH_ALGO_TGR_128] = "tgr128", - [HASH_ALGO_TGR_160] = "tgr160", - [HASH_ALGO_TGR_192] = "tgr192", - [HASH_ALGO_SM3_256] = "sm3", - [HASH_ALGO_STREEBOG_256] = "streebog256", - [HASH_ALGO_STREEBOG_512] = "streebog512", - [HASH_ALGO_SHA3_256] = "sha3-256", - [HASH_ALGO_SHA3_384] = "sha3-384", - [HASH_ALGO_SHA3_512] = "sha3-512", -}; -EXPORT_SYMBOL_GPL(hash_algo_name); - -const int hash_digest_size[HASH_ALGO__LAST] = { - [HASH_ALGO_MD4] = MD5_DIGEST_SIZE, - [HASH_ALGO_MD5] = MD5_DIGEST_SIZE, - [HASH_ALGO_SHA1] = SHA1_DIGEST_SIZE, - [HASH_ALGO_RIPE_MD_160] = RMD160_DIGEST_SIZE, - [HASH_ALGO_SHA256] = SHA256_DIGEST_SIZE, - [HASH_ALGO_SHA384] = SHA384_DIGEST_SIZE, - [HASH_ALGO_SHA512] = SHA512_DIGEST_SIZE, - [HASH_ALGO_SHA224] = SHA224_DIGEST_SIZE, - [HASH_ALGO_RIPE_MD_128] = RMD128_DIGEST_SIZE, - [HASH_ALGO_RIPE_MD_256] = RMD256_DIGEST_SIZE, - [HASH_ALGO_RIPE_MD_320] = RMD320_DIGEST_SIZE, - [HASH_ALGO_WP_256] = WP256_DIGEST_SIZE, - [HASH_ALGO_WP_384] = WP384_DIGEST_SIZE, - [HASH_ALGO_WP_512] = WP512_DIGEST_SIZE, - [HASH_ALGO_TGR_128] = TGR128_DIGEST_SIZE, - [HASH_ALGO_TGR_160] = TGR160_DIGEST_SIZE, - [HASH_ALGO_TGR_192] = TGR192_DIGEST_SIZE, - [HASH_ALGO_SM3_256] = SM3256_DIGEST_SIZE, - [HASH_ALGO_STREEBOG_256] = STREEBOG256_DIGEST_SIZE, - [HASH_ALGO_STREEBOG_512] = STREEBOG512_DIGEST_SIZE, - [HASH_ALGO_SHA3_256] = SHA3_256_DIGEST_SIZE, - [HASH_ALGO_SHA3_384] = SHA3_384_DIGEST_SIZE, - [HASH_ALGO_SHA3_512] = 
SHA3_512_DIGEST_SIZE, -}; -EXPORT_SYMBOL_GPL(hash_digest_size); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 3305c6908581..ee7e1e27ea6a 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -2,6 +2,9 @@ menu "Crypto library routines" +config CRYPTO_HASH_INFO + bool + config CRYPTO_LIB_UTILS tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index a887bf103bf0..533bb1533e19 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -8,6 +8,8 @@ quiet_cmd_perlasm = PERLASM $@ quiet_cmd_perlasm_with_args = PERLASM $@ cmd_perlasm_with_args = $(PERL) $(<) void $(@) +obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o + obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o libcryptoutils-y := memneq.o utils.o diff --git a/lib/crypto/hash_info.c b/lib/crypto/hash_info.c new file mode 100644 index 000000000000..9a467638c971 --- /dev/null +++ b/lib/crypto/hash_info.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Hash Info: Hash algorithms information + * + * Copyright (c) 2013 Dmitry Kasatkin + */ + +#include +#include + +const char *const hash_algo_name[HASH_ALGO__LAST] = { + [HASH_ALGO_MD4] = "md4", + [HASH_ALGO_MD5] = "md5", + [HASH_ALGO_SHA1] = "sha1", + [HASH_ALGO_RIPE_MD_160] = "rmd160", + [HASH_ALGO_SHA256] = "sha256", + [HASH_ALGO_SHA384] = "sha384", + [HASH_ALGO_SHA512] = "sha512", + [HASH_ALGO_SHA224] = "sha224", + [HASH_ALGO_RIPE_MD_128] = "rmd128", + [HASH_ALGO_RIPE_MD_256] = "rmd256", + [HASH_ALGO_RIPE_MD_320] = "rmd320", + [HASH_ALGO_WP_256] = "wp256", + [HASH_ALGO_WP_384] = "wp384", + [HASH_ALGO_WP_512] = "wp512", + [HASH_ALGO_TGR_128] = "tgr128", + [HASH_ALGO_TGR_160] = "tgr160", + [HASH_ALGO_TGR_192] = "tgr192", + [HASH_ALGO_SM3_256] = "sm3", + [HASH_ALGO_STREEBOG_256] = "streebog256", + [HASH_ALGO_STREEBOG_512] = "streebog512", + [HASH_ALGO_SHA3_256] = "sha3-256", + [HASH_ALGO_SHA3_384] = "sha3-384", + [HASH_ALGO_SHA3_512] = "sha3-512", +}; +EXPORT_SYMBOL_GPL(hash_algo_name); + +const int hash_digest_size[HASH_ALGO__LAST] = { + [HASH_ALGO_MD4] = MD5_DIGEST_SIZE, + [HASH_ALGO_MD5] = MD5_DIGEST_SIZE, + [HASH_ALGO_SHA1] = SHA1_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_160] = RMD160_DIGEST_SIZE, + [HASH_ALGO_SHA256] = SHA256_DIGEST_SIZE, + [HASH_ALGO_SHA384] = SHA384_DIGEST_SIZE, + [HASH_ALGO_SHA512] = SHA512_DIGEST_SIZE, + [HASH_ALGO_SHA224] = SHA224_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_128] = RMD128_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_256] = RMD256_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_320] = RMD320_DIGEST_SIZE, + [HASH_ALGO_WP_256] = WP256_DIGEST_SIZE, + [HASH_ALGO_WP_384] = WP384_DIGEST_SIZE, + [HASH_ALGO_WP_512] = WP512_DIGEST_SIZE, + [HASH_ALGO_TGR_128] = TGR128_DIGEST_SIZE, + [HASH_ALGO_TGR_160] = TGR160_DIGEST_SIZE, + [HASH_ALGO_TGR_192] = TGR192_DIGEST_SIZE, + [HASH_ALGO_SM3_256] = SM3256_DIGEST_SIZE, + [HASH_ALGO_STREEBOG_256] = STREEBOG256_DIGEST_SIZE, + [HASH_ALGO_STREEBOG_512] = STREEBOG512_DIGEST_SIZE, + [HASH_ALGO_SHA3_256] = SHA3_256_DIGEST_SIZE, + [HASH_ALGO_SHA3_384] = SHA3_384_DIGEST_SIZE, + [HASH_ALGO_SHA3_512] = SHA3_512_DIGEST_SIZE, +}; +EXPORT_SYMBOL_GPL(hash_digest_size); -- cgit v1.2.3 From 52c3e242f4d0043186b70d65460ba1767f27494a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Jul 2025 16:10:57 -0700 Subject: lib/crypto: arm/poly1305: Fix register corruption in no-SIMD contexts Restore the SIMD usability check that was removed by commit 773426f4771b ("crypto: arm/poly1305 - Add block-only interface"). This safety check is cheap and is well worth eliminating a footgun. 
While the Poly1305 functions should not be called when SIMD registers are unusable, if they are anyway, they should just do the right thing instead of corrupting random tasks' registers and/or computing incorrect MACs. Fixing this is also needed for poly1305_kunit to pass. Just use may_use_simd() instead of the original crypto_simd_usable(), since poly1305_kunit won't rely on crypto_simd_disabled_for_test. Fixes: 773426f4771b ("crypto: arm/poly1305 - Add block-only interface") Cc: stable@vger.kernel.org Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250706231100.176113-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/arm/poly1305-glue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c index 2603b0771f2c..ca6dc5533705 100644 --- a/lib/crypto/arm/poly1305-glue.c +++ b/lib/crypto/arm/poly1305-glue.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -39,7 +40,7 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, { len = round_down(len, POLY1305_BLOCK_SIZE); if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon)) { + static_branch_likely(&have_neon) && likely(may_use_simd())) { do { unsigned int todo = min_t(unsigned int, len, SZ_4K); -- cgit v1.2.3 From eec76ea5a7213c48529a46eed1b343e5cee3aaab Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Jul 2025 16:10:58 -0700 Subject: lib/crypto: arm64/poly1305: Fix register corruption in no-SIMD contexts Restore the SIMD usability check that was removed by commit a59e5468a921 ("crypto: arm64/poly1305 - Add block-only interface"). This safety check is cheap and is well worth eliminating a footgun. While the Poly1305 functions should not be called when SIMD registers are unusable, if they are anyway, they should just do the right thing instead of corrupting random tasks' registers and/or computing incorrect MACs. Fixing this is also needed for poly1305_kunit to pass. Just use may_use_simd() instead of the original crypto_simd_usable(), since poly1305_kunit won't rely on crypto_simd_disabled_for_test. Fixes: a59e5468a921 ("crypto: arm64/poly1305 - Add block-only interface") Cc: stable@vger.kernel.org Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250706231100.176113-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/arm64/poly1305-glue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/crypto/arm64/poly1305-glue.c b/lib/crypto/arm64/poly1305-glue.c index c9a74766785b..31aea21ce42f 100644 --- a/lib/crypto/arm64/poly1305-glue.c +++ b/lib/crypto/arm64/poly1305-glue.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -33,7 +34,7 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, unsigned int len, u32 padbit) { len = round_down(len, POLY1305_BLOCK_SIZE); - if (static_branch_likely(&have_neon)) { + if (static_branch_likely(&have_neon) && likely(may_use_simd())) { do { unsigned int todo = min_t(unsigned int, len, SZ_4K); -- cgit v1.2.3 From 16f2c30e290e04135b70ad374fb7e1d1ed9ff5e7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Jul 2025 16:10:59 -0700 Subject: lib/crypto: x86/poly1305: Fix register corruption in no-SIMD contexts Restore the SIMD usability check and base conversion that were removed by commit 318c53ae02f2 ("crypto: x86/poly1305 - Add block-only interface"). This safety check is cheap and is well worth eliminating a footgun. 
While the Poly1305 functions should not be called when SIMD registers are unusable, if they are anyway, they should just do the right thing instead of corrupting random tasks' registers and/or computing incorrect MACs. Fixing this is also needed for poly1305_kunit to pass. Just use irq_fpu_usable() instead of the original crypto_simd_usable(), since poly1305_kunit won't rely on crypto_simd_disabled_for_test. Fixes: 318c53ae02f2 ("crypto: x86/poly1305 - Add block-only interface") Cc: stable@vger.kernel.org Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250706231100.176113-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/poly1305_glue.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305_glue.c index b7e78a583e07..968d84677631 100644 --- a/lib/crypto/x86/poly1305_glue.c +++ b/lib/crypto/x86/poly1305_glue.c @@ -25,6 +25,42 @@ struct poly1305_arch_internal { struct { u32 r2, r1, r4, r3; } rn[9]; }; +/* + * The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit + * the unfortunate situation of using AVX and then having to go back to scalar + * -- because the user is silly and has called the update function from two + * separate contexts -- then we need to convert back to the original base before + * proceeding. It is possible to reason that the initial reduction below is + * sufficient given the implementation invariants. However, for an avoidance of + * doubt and because this is not performance critical, we do the full reduction + * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py + */ +static void convert_to_base2_64(void *ctx) +{ + struct poly1305_arch_internal *state = ctx; + u32 cy; + + if (!state->is_base2_26) + return; + + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); + state->hs[2] = state->h[4] >> 24; + /* Unsigned Less Than: branchlessly produces 1 if a < b, else 0. 
*/ +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); + state->hs[2] &= 3; + state->hs[0] += cy; + state->hs[1] += (cy = ULT(state->hs[0], cy)); + state->hs[2] += ULT(state->hs[1], cy); +#undef ULT + state->is_base2_26 = 0; +} + asmlinkage void poly1305_block_init_arch( struct poly1305_block_state *state, const u8 raw_key[POLY1305_BLOCK_SIZE]); @@ -62,7 +98,9 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || SZ_4K % POLY1305_BLOCK_SIZE); - if (!static_branch_likely(&poly1305_use_avx)) { + if (!static_branch_likely(&poly1305_use_avx) || + unlikely(!irq_fpu_usable())) { + convert_to_base2_64(ctx); poly1305_blocks_x86_64(ctx, inp, len, padbit); return; } -- cgit v1.2.3 From 9f65592b7e1f24569bb6ced064df5b4319f725ce Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Jul 2025 16:11:00 -0700 Subject: lib/crypto: x86/poly1305: Fix performance regression on short messages Restore the len >= 288 condition on using the AVX implementation, which was incidentally removed by commit 318c53ae02f2 ("crypto: x86/poly1305 - Add block-only interface"). This check took into account the overhead in key power computation, kernel-mode "FPU", and tail handling associated with the AVX code. Indeed, restoring this check slightly improves performance for len < 256 as measured using poly1305_kunit on an "AMD Ryzen AI 9 365" (Zen 5) CPU: Length Before After ====== ========== ========== 1 30 MB/s 36 MB/s 16 516 MB/s 598 MB/s 64 1700 MB/s 1882 MB/s 127 2265 MB/s 2651 MB/s 128 2457 MB/s 2827 MB/s 200 2702 MB/s 3238 MB/s 256 3841 MB/s 3768 MB/s 511 4580 MB/s 4585 MB/s 512 5430 MB/s 5398 MB/s 1024 7268 MB/s 7305 MB/s 3173 8999 MB/s 8948 MB/s 4096 9942 MB/s 9921 MB/s 16384 10557 MB/s 10545 MB/s While the optimal threshold for this CPU might be slightly lower than 288 (see the len == 256 case), other CPUs would need to be tested too, and these sorts of benchmarks can underestimate the true cost of kernel-mode "FPU". Therefore, for now just restore the 288 threshold. Fixes: 318c53ae02f2 ("crypto: x86/poly1305 - Add block-only interface") Cc: stable@vger.kernel.org Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250706231100.176113-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/poly1305_glue.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305_glue.c index 968d84677631..856d48fd422b 100644 --- a/lib/crypto/x86/poly1305_glue.c +++ b/lib/crypto/x86/poly1305_glue.c @@ -98,7 +98,15 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || SZ_4K % POLY1305_BLOCK_SIZE); + /* + * The AVX implementations have significant setup overhead (e.g. key + * power computation, kernel FPU enabling) which makes them slower for + * short messages. Fall back to the scalar implementation for messages + * shorter than 288 bytes, unless the AVX-specific key setup has already + * been performed (indicated by ctx->is_base2_26). 
+ */ if (!static_branch_likely(&poly1305_use_avx) || + (len < POLY1305_BLOCK_SIZE * 18 && !ctx->is_base2_26) || unlikely(!irq_fpu_usable())) { convert_to_base2_64(ctx); poly1305_blocks_x86_64(ctx, inp, len, padbit); -- cgit v1.2.3 From 6e07c5e166597de1d7943ecf2539cad18c0e2ce1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 11 Jul 2025 14:28:22 -0700 Subject: lib/crypto: arm/poly1305: Remove unneeded empty weak function Fix poly1305-armv4.pl to not do '.globl poly1305_blocks_neon' when poly1305_blocks_neon() is not defined. Then, remove the empty __weak definition of poly1305_blocks_neon(), which was still needed only because of that unnecessary globl statement. (It also used to be needed because the compiler could generate calls to it when CONFIG_KERNEL_MODE_NEON=n, but that has been fixed.) Thanks to Arnd Bergmann for reporting that the globl statement in the asm file was still depending on the weak symbol. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250711212822.6372-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/arm/poly1305-armv4.pl | 2 +- lib/crypto/arm/poly1305-glue.c | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl index d57c6e2fc84a..dd7a996361a7 100644 --- a/lib/crypto/arm/poly1305-armv4.pl +++ b/lib/crypto/arm/poly1305-armv4.pl @@ -46,7 +46,6 @@ $code.=<<___; # define poly1305_init poly1305_block_init_arch # define poly1305_blocks poly1305_blocks_arm # define poly1305_emit poly1305_emit_arch -.globl poly1305_blocks_neon #endif #if defined(__thumb2__) @@ -722,6 +721,7 @@ poly1305_init_neon: ret @ bx lr .size poly1305_init_neon,.-poly1305_init_neon +.globl poly1305_blocks_neon .type poly1305_blocks_neon,%function .align 5 poly1305_blocks_neon: diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c index ca6dc5533705..2d86c78af883 100644 --- a/lib/crypto/arm/poly1305-glue.c +++ b/lib/crypto/arm/poly1305-glue.c @@ -28,11 +28,6 @@ asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, const u32 nonce[4]); EXPORT_SYMBOL_GPL(poly1305_emit_arch); -void __weak poly1305_blocks_neon(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit) -{ -} - static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, -- cgit v1.2.3 From 7941ad696506917fa6228f44be2df0c2f0909a62 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 11 Jul 2025 14:58:43 -0700 Subject: lib/crypto: sha2: Add hmac_sha*_init_usingrawkey() While the HMAC library functions support both incremental and one-shot computation and both prepared and raw keys, the combination of raw key + incremental was missing. It turns out that several potential users of the HMAC library functions (tpm2-sessions.c, smb2transport.c, trusted_tpm1.c) want exactly that. Therefore, add the missing functions hmac_sha*_init_usingrawkey(). Implement them in an optimized way that directly initializes the HMAC context without a separate key preparation step. Reimplement the one-shot raw key functions hmac_sha*_usingrawkey() on top of the new functions, which makes them a bit more efficient. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250711215844.41715-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha2.h | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++ lib/crypto/sha256.c | 53 ++++++++++++++++++++++++++++++--------------- lib/crypto/sha512.c | 55 +++++++++++++++++++++++++++++++--------------- 3 files changed, 134 insertions(+), 34 deletions(-) diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index e0a08f6addd0..15e461e568cc 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -247,6 +247,21 @@ static inline void hmac_sha224_init(struct hmac_sha224_ctx *ctx, __hmac_sha256_init(&ctx->ctx, &key->key); } +/** + * hmac_sha224_init_usingrawkey() - Initialize an HMAC-SHA224 context for a new + * message, using a raw key + * @ctx: (output) the HMAC context to initialize + * @raw_key: the raw HMAC-SHA224 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * If you don't need incremental computation, consider hmac_sha224_usingrawkey() + * instead. + * + * Context: Any context. + */ +void hmac_sha224_init_usingrawkey(struct hmac_sha224_ctx *ctx, + const u8 *raw_key, size_t raw_key_len); + /** * hmac_sha224_update() - Update an HMAC-SHA224 context with message data * @ctx: the HMAC context to update; must have been initialized @@ -405,6 +420,21 @@ static inline void hmac_sha256_init(struct hmac_sha256_ctx *ctx, __hmac_sha256_init(&ctx->ctx, &key->key); } +/** + * hmac_sha256_init_usingrawkey() - Initialize an HMAC-SHA256 context for a new + * message, using a raw key + * @ctx: (output) the HMAC context to initialize + * @raw_key: the raw HMAC-SHA256 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * If you don't need incremental computation, consider hmac_sha256_usingrawkey() + * instead. + * + * Context: Any context. + */ +void hmac_sha256_init_usingrawkey(struct hmac_sha256_ctx *ctx, + const u8 *raw_key, size_t raw_key_len); + /** * hmac_sha256_update() - Update an HMAC-SHA256 context with message data * @ctx: the HMAC context to update; must have been initialized @@ -597,6 +627,21 @@ static inline void hmac_sha384_init(struct hmac_sha384_ctx *ctx, __hmac_sha512_init(&ctx->ctx, &key->key); } +/** + * hmac_sha384_init_usingrawkey() - Initialize an HMAC-SHA384 context for a new + * message, using a raw key + * @ctx: (output) the HMAC context to initialize + * @raw_key: the raw HMAC-SHA384 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * If you don't need incremental computation, consider hmac_sha384_usingrawkey() + * instead. + * + * Context: Any context. + */ +void hmac_sha384_init_usingrawkey(struct hmac_sha384_ctx *ctx, + const u8 *raw_key, size_t raw_key_len); + /** * hmac_sha384_update() - Update an HMAC-SHA384 context with message data * @ctx: the HMAC context to update; must have been initialized @@ -755,6 +800,21 @@ static inline void hmac_sha512_init(struct hmac_sha512_ctx *ctx, __hmac_sha512_init(&ctx->ctx, &key->key); } +/** + * hmac_sha512_init_usingrawkey() - Initialize an HMAC-SHA512 context for a new + * message, using a raw key + * @ctx: (output) the HMAC context to initialize + * @raw_key: the raw HMAC-SHA512 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * If you don't need incremental computation, consider hmac_sha512_usingrawkey() + * instead. + * + * Context: Any context. 
+ */ +void hmac_sha512_init_usingrawkey(struct hmac_sha512_ctx *ctx, + const u8 *raw_key, size_t raw_key_len); + /** * hmac_sha512_update() - Update an HMAC-SHA512 context with message data * @ctx: the HMAC context to update; must have been initialized diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 808438d4f427..8fa15165d23e 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -263,7 +263,8 @@ EXPORT_SYMBOL(sha256); /* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */ #ifndef __DISABLE_EXPORTS -static void __hmac_sha256_preparekey(struct __hmac_sha256_key *key, +static void __hmac_sha256_preparekey(struct sha256_block_state *istate, + struct sha256_block_state *ostate, const u8 *raw_key, size_t raw_key_len, const struct sha256_block_state *iv) { @@ -283,14 +284,14 @@ static void __hmac_sha256_preparekey(struct __hmac_sha256_key *key, for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); - key->istate = *iv; - sha256_blocks(&key->istate, derived_key.b, 1); + *istate = *iv; + sha256_blocks(istate, derived_key.b, 1); for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ HMAC_IPAD_VALUE); - key->ostate = *iv; - sha256_blocks(&key->ostate, derived_key.b, 1); + *ostate = *iv; + sha256_blocks(ostate, derived_key.b, 1); memzero_explicit(&derived_key, sizeof(derived_key)); } @@ -298,14 +299,16 @@ static void __hmac_sha256_preparekey(struct __hmac_sha256_key *key, void hmac_sha224_preparekey(struct hmac_sha224_key *key, const u8 *raw_key, size_t raw_key_len) { - __hmac_sha256_preparekey(&key->key, raw_key, raw_key_len, &sha224_iv); + __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate, + raw_key, raw_key_len, &sha224_iv); } EXPORT_SYMBOL_GPL(hmac_sha224_preparekey); void hmac_sha256_preparekey(struct hmac_sha256_key *key, const u8 *raw_key, size_t raw_key_len) { - __hmac_sha256_preparekey(&key->key, raw_key, raw_key_len, &sha256_iv); + __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate, + raw_key, raw_key_len, &sha256_iv); } EXPORT_SYMBOL_GPL(hmac_sha256_preparekey); @@ -317,6 +320,24 @@ void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx, } EXPORT_SYMBOL_GPL(__hmac_sha256_init); +void hmac_sha224_init_usingrawkey(struct hmac_sha224_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, + raw_key, raw_key_len, &sha224_iv); + ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE; +} +EXPORT_SYMBOL_GPL(hmac_sha224_init_usingrawkey); + +void hmac_sha256_init_usingrawkey(struct hmac_sha256_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, + raw_key, raw_key_len, &sha256_iv); + ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE; +} +EXPORT_SYMBOL_GPL(hmac_sha256_init_usingrawkey); + static void __hmac_sha256_final(struct __hmac_sha256_ctx *ctx, u8 *out, size_t digest_size) { @@ -376,12 +397,11 @@ void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]) { - struct hmac_sha224_key key; - - hmac_sha224_preparekey(&key, raw_key, raw_key_len); - hmac_sha224(&key, data, data_len, out); + struct hmac_sha224_ctx ctx; - memzero_explicit(&key, sizeof(key)); + hmac_sha224_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha224_update(&ctx, data, data_len); + hmac_sha224_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha224_usingrawkey); @@ 
-389,12 +409,11 @@ void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]) { - struct hmac_sha256_key key; - - hmac_sha256_preparekey(&key, raw_key, raw_key_len); - hmac_sha256(&key, data, data_len, out); + struct hmac_sha256_ctx ctx; - memzero_explicit(&key, sizeof(key)); + hmac_sha256_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha256_update(&ctx, data, data_len); + hmac_sha256_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey); #endif /* !__DISABLE_EXPORTS */ diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c index d514721491ca..d8062188be98 100644 --- a/lib/crypto/sha512.c +++ b/lib/crypto/sha512.c @@ -249,7 +249,8 @@ void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]) } EXPORT_SYMBOL_GPL(sha512); -static void __hmac_sha512_preparekey(struct __hmac_sha512_key *key, +static void __hmac_sha512_preparekey(struct sha512_block_state *istate, + struct sha512_block_state *ostate, const u8 *raw_key, size_t raw_key_len, const struct sha512_block_state *iv) { @@ -269,14 +270,14 @@ static void __hmac_sha512_preparekey(struct __hmac_sha512_key *key, for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); - key->istate = *iv; - sha512_blocks(&key->istate, derived_key.b, 1); + *istate = *iv; + sha512_blocks(istate, derived_key.b, 1); for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ HMAC_IPAD_VALUE); - key->ostate = *iv; - sha512_blocks(&key->ostate, derived_key.b, 1); + *ostate = *iv; + sha512_blocks(ostate, derived_key.b, 1); memzero_explicit(&derived_key, sizeof(derived_key)); } @@ -284,14 +285,16 @@ static void __hmac_sha512_preparekey(struct __hmac_sha512_key *key, void hmac_sha384_preparekey(struct hmac_sha384_key *key, const u8 *raw_key, size_t raw_key_len) { - __hmac_sha512_preparekey(&key->key, raw_key, raw_key_len, &sha384_iv); + __hmac_sha512_preparekey(&key->key.istate, &key->key.ostate, + raw_key, raw_key_len, &sha384_iv); } EXPORT_SYMBOL_GPL(hmac_sha384_preparekey); void hmac_sha512_preparekey(struct hmac_sha512_key *key, const u8 *raw_key, size_t raw_key_len) { - __hmac_sha512_preparekey(&key->key, raw_key, raw_key_len, &sha512_iv); + __hmac_sha512_preparekey(&key->key.istate, &key->key.ostate, + raw_key, raw_key_len, &sha512_iv); } EXPORT_SYMBOL_GPL(hmac_sha512_preparekey); @@ -303,6 +306,26 @@ void __hmac_sha512_init(struct __hmac_sha512_ctx *ctx, } EXPORT_SYMBOL_GPL(__hmac_sha512_init); +void hmac_sha384_init_usingrawkey(struct hmac_sha384_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, + raw_key, raw_key_len, &sha384_iv); + ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE; + ctx->ctx.sha_ctx.bytecount_hi = 0; +} +EXPORT_SYMBOL_GPL(hmac_sha384_init_usingrawkey); + +void hmac_sha512_init_usingrawkey(struct hmac_sha512_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, + raw_key, raw_key_len, &sha512_iv); + ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE; + ctx->ctx.sha_ctx.bytecount_hi = 0; +} +EXPORT_SYMBOL_GPL(hmac_sha512_init_usingrawkey); + static void __hmac_sha512_final(struct __hmac_sha512_ctx *ctx, u8 *out, size_t digest_size) { @@ -362,12 +385,11 @@ void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE]) { - struct hmac_sha384_key key; - - 
hmac_sha384_preparekey(&key, raw_key, raw_key_len); - hmac_sha384(&key, data, data_len, out); + struct hmac_sha384_ctx ctx; - memzero_explicit(&key, sizeof(key)); + hmac_sha384_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha384_update(&ctx, data, data_len); + hmac_sha384_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha384_usingrawkey); @@ -375,12 +397,11 @@ void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE]) { - struct hmac_sha512_key key; - - hmac_sha512_preparekey(&key, raw_key, raw_key_len); - hmac_sha512(&key, data, data_len, out); + struct hmac_sha512_ctx ctx; - memzero_explicit(&key, sizeof(key)); + hmac_sha512_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha512_update(&ctx, data, data_len); + hmac_sha512_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey); -- cgit v1.2.3 From 56119446f89fbab40f3a160e9c3de33687cd85cf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:22:52 -0700 Subject: crypto: x86/sha1 - Rename conflicting symbol Rename x86's sha1_update() to sha1_update_x86(), since it conflicts with the upcoming sha1_update() library function. Note: the affected code will be superseded by later commits that migrate the arch-optimized SHA-1 code into the library. This commit simply keeps the kernel building for the initial introduction of the library. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/x86/crypto/sha1_ssse3_glue.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 0a912bfc86c5..826579a7473c 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -34,7 +34,7 @@ static const struct x86_cpu_id module_cpu_ids[] = { }; MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); -static inline int sha1_update(struct shash_desc *desc, const u8 *data, +static inline int sha1_update_x86(struct shash_desc *desc, const u8 *data, unsigned int len, sha1_block_fn *sha1_xform) { int remain; @@ -69,7 +69,7 @@ asmlinkage void sha1_transform_ssse3(struct sha1_state *state, static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha1_update(desc, data, len, sha1_transform_ssse3); + return sha1_update_x86(desc, data, len, sha1_transform_ssse3); } static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, @@ -113,7 +113,7 @@ asmlinkage void sha1_transform_avx(struct sha1_state *state, static int sha1_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha1_update(desc, data, len, sha1_transform_avx); + return sha1_update_x86(desc, data, len, sha1_transform_avx); } static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, @@ -190,7 +190,7 @@ static inline void sha1_apply_transform_avx2(struct sha1_state *state, static int sha1_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha1_update(desc, data, len, sha1_apply_transform_avx2); + return sha1_update_x86(desc, data, len, sha1_apply_transform_avx2); } static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, @@ -234,7 +234,7 @@ asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data, static int sha1_ni_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha1_update(desc, data, len, sha1_ni_transform); + return sha1_update_x86(desc, 
data, len, sha1_ni_transform); } static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, -- cgit v1.2.3 From 9503ca2ccafec51ee9e533d6f3aef14a589fc106 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:22:53 -0700 Subject: lib/crypto: sha1: Rename sha1_init() to sha1_init_raw() Rename the existing sha1_init() to sha1_init_raw(), since it conflicts with the upcoming library function. This will later be removed, but this keeps the kernel building for the introduction of the library. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha1.h | 2 +- kernel/bpf/core.c | 2 +- lib/crypto/sha1.c | 6 +++--- net/ipv6/addrconf.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/crypto/sha1.h b/include/crypto/sha1.h index f48230b1413c..d853d3b93169 100644 --- a/include/crypto/sha1.h +++ b/include/crypto/sha1.h @@ -33,7 +33,7 @@ struct sha1_state { */ #define SHA1_DIGEST_WORDS (SHA1_DIGEST_SIZE / 4) #define SHA1_WORKSPACE_WORDS 16 -void sha1_init(__u32 *buf); +void sha1_init_raw(__u32 *buf); void sha1_transform(__u32 *digest, const char *data, __u32 *W); #endif /* _CRYPTO_SHA1_H */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c20babbf998f..dae281a1286d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -304,7 +304,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) if (!raw) return -ENOMEM; - sha1_init(digest); + sha1_init_raw(digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c index 6d809c3088be..813ad96daa25 100644 --- a/lib/crypto/sha1.c +++ b/lib/crypto/sha1.c @@ -124,10 +124,10 @@ void sha1_transform(__u32 *digest, const char *data, __u32 *array) EXPORT_SYMBOL(sha1_transform); /** - * sha1_init - initialize the vectors for a SHA1 digest + * sha1_init_raw - initialize the vectors for a SHA1 digest * @buf: vector to initialize */ -void sha1_init(__u32 *buf) +void sha1_init_raw(__u32 *buf) { buf[0] = 0x67452301; buf[1] = 0xefcdab89; @@ -135,7 +135,7 @@ void sha1_init(__u32 *buf) buf[3] = 0x10325476; buf[4] = 0xc3d2e1f0; } -EXPORT_SYMBOL(sha1_init); +EXPORT_SYMBOL(sha1_init_raw); MODULE_DESCRIPTION("SHA-1 Algorithm"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ba2ec7c870cc..d0e5b94c10af 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3367,7 +3367,7 @@ static int ipv6_generate_stable_address(struct in6_addr *address, retry: spin_lock_bh(&lock); - sha1_init(digest); + sha1_init_raw(digest); memset(&data, 0, sizeof(data)); memset(workspace, 0, sizeof(workspace)); memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len); -- cgit v1.2.3 From 90860aef630c5c9e58d05044f2866fcbfa7aa4d9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:22:54 -0700 Subject: lib/crypto: sha1: Add SHA-1 library functions Add a library interface for SHA-1, following the SHA-2 one. As was the case with SHA-2, this will be useful for various in-kernel users. The crypto_shash interface will be reimplemented on top of it as well. 
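For context (not part of the patch), a minimal usage sketch of the interface being added. The wrapper functions and buffer names are illustrative; the sha1_* calls and SHA1_DIGEST_SIZE come from the new <crypto/sha1.h> API.

#include <crypto/sha1.h>

static void example_sha1_two_chunks(const u8 *part1, size_t len1,
				    const u8 *part2, size_t len2,
				    u8 digest[SHA1_DIGEST_SIZE])
{
	struct sha1_ctx ctx;

	sha1_init(&ctx);
	sha1_update(&ctx, part1, len1);
	sha1_update(&ctx, part2, len2);
	sha1_final(&ctx, digest);	/* zeroizes ctx after finishing */
}

/* One-shot form for contiguous data: */
static void example_sha1_oneshot(const u8 *data, size_t len,
				 u8 digest[SHA1_DIGEST_SIZE])
{
	sha1(data, len, digest);
}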
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha1.h | 60 ++++++++++++++++++++++++++ lib/crypto/Kconfig | 7 +++ lib/crypto/Makefile | 9 +++- lib/crypto/sha1.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 184 insertions(+), 7 deletions(-) diff --git a/include/crypto/sha1.h b/include/crypto/sha1.h index d853d3b93169..387f6123a05e 100644 --- a/include/crypto/sha1.h +++ b/include/crypto/sha1.h @@ -36,4 +36,64 @@ struct sha1_state { void sha1_init_raw(__u32 *buf); void sha1_transform(__u32 *digest, const char *data, __u32 *W); +/* State for the SHA-1 compression function */ +struct sha1_block_state { + u32 h[SHA1_DIGEST_SIZE / 4]; +}; + +/** + * struct sha1_ctx - Context for hashing a message with SHA-1 + * @state: the compression function state + * @bytecount: number of bytes processed so far + * @buf: partial block buffer; bytecount % SHA1_BLOCK_SIZE bytes are valid + */ +struct sha1_ctx { + struct sha1_block_state state; + u64 bytecount; + u8 buf[SHA1_BLOCK_SIZE]; +}; + +/** + * sha1_init() - Initialize a SHA-1 context for a new message + * @ctx: the context to initialize + * + * If you don't need incremental computation, consider sha1() instead. + * + * Context: Any context. + */ +void sha1_init(struct sha1_ctx *ctx); + +/** + * sha1_update() - Update a SHA-1 context with message data + * @ctx: the context to update; must have been initialized + * @data: the message data + * @len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len); + +/** + * sha1_final() - Finish computing a SHA-1 message digest + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting SHA-1 message digest + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]); + +/** + * sha1() - Compute SHA-1 message digest in one shot + * @data: the message data + * @len: the data length in bytes + * @out: (output) the resulting SHA-1 message digest + * + * Context: Any context. + */ +void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE]); + #endif /* _CRYPTO_SHA1_H */ diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index ee7e1e27ea6a..5aaf484fc9de 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -139,6 +139,13 @@ config CRYPTO_LIB_CHACHA20POLY1305 config CRYPTO_LIB_SHA1 tristate + help + The SHA-1 library functions. Select this if your module uses any of + the functions from . 
+ +config CRYPTO_LIB_SHA1_ARCH + bool + depends on CRYPTO_LIB_SHA1 && !UML config CRYPTO_LIB_SHA256 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 533bb1533e19..0eb0906d693f 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -65,8 +65,13 @@ libpoly1305-generic-y := poly1305-donna32.o libpoly1305-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o libpoly1305-generic-y += poly1305-generic.o -obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o -libsha1-y := sha1.o +################################################################################ + +obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o +libsha1-y := sha1.o +ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y) +CFLAGS_sha1.o += -I$(src)/$(SRCARCH) +endif # CONFIG_CRYPTO_LIB_SHA1_ARCH ################################################################################ diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c index 813ad96daa25..0fe9ca8d0653 100644 --- a/lib/crypto/sha1.c +++ b/lib/crypto/sha1.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * SHA1 routine optimized to do word accesses rather than byte accesses, - * and to avoid unnecessary copies into the context array. - * - * This was based on the git SHA1 implementation. + * SHA-1 library functions */ #include @@ -14,6 +11,10 @@ #include #include +static const struct sha1_block_state sha1_iv = { + .h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, +}; + /* * If you have 32 registers or more, the compiler can (and should) * try to change the array[] accesses into registers. However, on @@ -137,5 +138,109 @@ void sha1_init_raw(__u32 *buf) } EXPORT_SYMBOL(sha1_init_raw); -MODULE_DESCRIPTION("SHA-1 Algorithm"); +static void __maybe_unused sha1_blocks_generic(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + u32 workspace[SHA1_WORKSPACE_WORDS]; + + do { + sha1_transform(state->h, data, workspace); + data += SHA1_BLOCK_SIZE; + } while (--nblocks); + + memzero_explicit(workspace, sizeof(workspace)); +} + +#ifdef CONFIG_CRYPTO_LIB_SHA1_ARCH +#include "sha1.h" /* $(SRCARCH)/sha1.h */ +#else +#define sha1_blocks sha1_blocks_generic +#endif + +void sha1_init(struct sha1_ctx *ctx) +{ + ctx->state = sha1_iv; + ctx->bytecount = 0; +} +EXPORT_SYMBOL_GPL(sha1_init); + +void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len) +{ + size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE; + + ctx->bytecount += len; + + if (partial + len >= SHA1_BLOCK_SIZE) { + size_t nblocks; + + if (partial) { + size_t l = SHA1_BLOCK_SIZE - partial; + + memcpy(&ctx->buf[partial], data, l); + data += l; + len -= l; + + sha1_blocks(&ctx->state, ctx->buf, 1); + } + + nblocks = len / SHA1_BLOCK_SIZE; + len %= SHA1_BLOCK_SIZE; + + if (nblocks) { + sha1_blocks(&ctx->state, data, nblocks); + data += nblocks * SHA1_BLOCK_SIZE; + } + partial = 0; + } + if (len) + memcpy(&ctx->buf[partial], data, len); +} +EXPORT_SYMBOL_GPL(sha1_update); + +void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]) +{ + u64 bitcount = ctx->bytecount << 3; + size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE; + + ctx->buf[partial++] = 0x80; + if (partial > SHA1_BLOCK_SIZE - 8) { + memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - partial); + sha1_blocks(&ctx->state, ctx->buf, 1); + partial = 0; + } + memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - 8 - partial); + *(__be64 *)&ctx->buf[SHA1_BLOCK_SIZE - 8] = cpu_to_be64(bitcount); + sha1_blocks(&ctx->state, ctx->buf, 1); + + for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4) + put_unaligned_be32(ctx->state.h[i / 4], out + i); + 
memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(sha1_final); + +void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE]) +{ + struct sha1_ctx ctx; + + sha1_init(&ctx); + sha1_update(&ctx, data, len); + sha1_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha1); + +#ifdef sha1_mod_init_arch +static int __init sha1_mod_init(void) +{ + sha1_mod_init_arch(); + return 0; +} +subsys_initcall(sha1_mod_init); + +static void __exit sha1_mod_exit(void) +{ +} +module_exit(sha1_mod_exit); +#endif + +MODULE_DESCRIPTION("SHA-1 library functions"); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 4cbc84471bb606ddfaf424709dd8d56b56d7ae7b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:22:55 -0700 Subject: lib/crypto: sha1: Add HMAC support Add HMAC support to the SHA-1 library, again following what was done for SHA-2. Besides providing the basis for a more streamlined "hmac(sha1)" shash, this will also be useful for multiple in-kernel users such as net/sctp/auth.c, net/ipv6/seg6_hmac.c, and security/keys/trusted-keys/trusted_tpm1.c. Those are currently using crypto_shash, but using the library functions would be much simpler. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha1.h | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib/crypto/sha1.c | 108 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 223 insertions(+), 3 deletions(-) diff --git a/include/crypto/sha1.h b/include/crypto/sha1.h index 387f6123a05e..162a529ec841 100644 --- a/include/crypto/sha1.h +++ b/include/crypto/sha1.h @@ -96,4 +96,122 @@ void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]); */ void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE]); +/** + * struct hmac_sha1_key - Prepared key for HMAC-SHA1 + * @istate: private + * @ostate: private + */ +struct hmac_sha1_key { + struct sha1_block_state istate; + struct sha1_block_state ostate; +}; + +/** + * struct hmac_sha1_ctx - Context for computing HMAC-SHA1 of a message + * @sha_ctx: private + * @ostate: private + */ +struct hmac_sha1_ctx { + struct sha1_ctx sha_ctx; + struct sha1_block_state ostate; +}; + +/** + * hmac_sha1_preparekey() - Prepare a key for HMAC-SHA1 + * @key: (output) the key structure to initialize + * @raw_key: the raw HMAC-SHA1 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * Note: the caller is responsible for zeroizing both the struct hmac_sha1_key + * and the raw key once they are no longer needed. + * + * Context: Any context. + */ +void hmac_sha1_preparekey(struct hmac_sha1_key *key, + const u8 *raw_key, size_t raw_key_len); + +/** + * hmac_sha1_init() - Initialize an HMAC-SHA1 context for a new message + * @ctx: (output) the HMAC context to initialize + * @key: the prepared HMAC key + * + * If you don't need incremental computation, consider hmac_sha1() instead. + * + * Context: Any context. + */ +void hmac_sha1_init(struct hmac_sha1_ctx *ctx, const struct hmac_sha1_key *key); + +/** + * hmac_sha1_init_usingrawkey() - Initialize an HMAC-SHA1 context for a new + * message, using a raw key + * @ctx: (output) the HMAC context to initialize + * @raw_key: the raw HMAC-SHA1 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * If you don't need incremental computation, consider hmac_sha1_usingrawkey() + * instead. + * + * Context: Any context. 
+ */ +void hmac_sha1_init_usingrawkey(struct hmac_sha1_ctx *ctx, + const u8 *raw_key, size_t raw_key_len); + +/** + * hmac_sha1_update() - Update an HMAC-SHA1 context with message data + * @ctx: the HMAC context to update; must have been initialized + * @data: the message data + * @data_len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +static inline void hmac_sha1_update(struct hmac_sha1_ctx *ctx, + const u8 *data, size_t data_len) +{ + sha1_update(&ctx->sha_ctx, data, data_len); +} + +/** + * hmac_sha1_final() - Finish computing an HMAC-SHA1 value + * @ctx: the HMAC context to finalize; must have been initialized + * @out: (output) the resulting HMAC-SHA1 value + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]); + +/** + * hmac_sha1() - Compute HMAC-SHA1 in one shot, using a prepared key + * @key: the prepared HMAC key + * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-SHA1 value + * + * If you're using the key only once, consider using hmac_sha1_usingrawkey(). + * + * Context: Any context. + */ +void hmac_sha1(const struct hmac_sha1_key *key, + const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE]); + +/** + * hmac_sha1_usingrawkey() - Compute HMAC-SHA1 in one shot, using a raw key + * @raw_key: the raw HMAC-SHA1 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-SHA1 value + * + * If you're using the key multiple times, prefer to use hmac_sha1_preparekey() + * followed by multiple calls to hmac_sha1() instead. + * + * Context: Any context. 
+ */ +void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA1_DIGEST_SIZE]); + #endif /* _CRYPTO_SHA1_H */ diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c index 0fe9ca8d0653..5904e4ae85d2 100644 --- a/lib/crypto/sha1.c +++ b/lib/crypto/sha1.c @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* - * SHA-1 library functions + * SHA-1 and HMAC-SHA1 library functions */ +#include #include #include #include @@ -10,6 +11,7 @@ #include #include #include +#include static const struct sha1_block_state sha1_iv = { .h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, @@ -197,7 +199,7 @@ void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len) } EXPORT_SYMBOL_GPL(sha1_update); -void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]) +static void __sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]) { u64 bitcount = ctx->bytecount << 3; size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE; @@ -214,6 +216,11 @@ void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]) for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4) put_unaligned_be32(ctx->state.h[i / 4], out + i); +} + +void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]) +{ + __sha1_final(ctx, out); memzero_explicit(ctx, sizeof(*ctx)); } EXPORT_SYMBOL_GPL(sha1_final); @@ -228,6 +235,101 @@ void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE]) } EXPORT_SYMBOL_GPL(sha1); +static void __hmac_sha1_preparekey(struct sha1_block_state *istate, + struct sha1_block_state *ostate, + const u8 *raw_key, size_t raw_key_len) +{ + union { + u8 b[SHA1_BLOCK_SIZE]; + unsigned long w[SHA1_BLOCK_SIZE / sizeof(unsigned long)]; + } derived_key = { 0 }; + + if (unlikely(raw_key_len > SHA1_BLOCK_SIZE)) + sha1(raw_key, raw_key_len, derived_key.b); + else + memcpy(derived_key.b, raw_key, raw_key_len); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); + *istate = sha1_iv; + sha1_blocks(istate, derived_key.b, 1); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ + HMAC_IPAD_VALUE); + *ostate = sha1_iv; + sha1_blocks(ostate, derived_key.b, 1); + + memzero_explicit(&derived_key, sizeof(derived_key)); +} + +void hmac_sha1_preparekey(struct hmac_sha1_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha1_preparekey(&key->istate, &key->ostate, + raw_key, raw_key_len); +} +EXPORT_SYMBOL_GPL(hmac_sha1_preparekey); + +void hmac_sha1_init(struct hmac_sha1_ctx *ctx, const struct hmac_sha1_key *key) +{ + ctx->sha_ctx.state = key->istate; + ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE; + ctx->ostate = key->ostate; +} +EXPORT_SYMBOL_GPL(hmac_sha1_init); + +void hmac_sha1_init_usingrawkey(struct hmac_sha1_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha1_preparekey(&ctx->sha_ctx.state, &ctx->ostate, + raw_key, raw_key_len); + ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE; +} +EXPORT_SYMBOL_GPL(hmac_sha1_init_usingrawkey); + +void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]) +{ + /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */ + __sha1_final(&ctx->sha_ctx, ctx->sha_ctx.buf); + memset(&ctx->sha_ctx.buf[SHA1_DIGEST_SIZE], 0, + SHA1_BLOCK_SIZE - SHA1_DIGEST_SIZE); + ctx->sha_ctx.buf[SHA1_DIGEST_SIZE] = 0x80; + *(__be32 *)&ctx->sha_ctx.buf[SHA1_BLOCK_SIZE - 4] = + cpu_to_be32(8 * (SHA1_BLOCK_SIZE + SHA1_DIGEST_SIZE)); + + /* Compute the outer hash, which gives the HMAC value. 
*/ + sha1_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1); + for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4) + put_unaligned_be32(ctx->ostate.h[i / 4], out + i); + + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(hmac_sha1_final); + +void hmac_sha1(const struct hmac_sha1_key *key, + const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE]) +{ + struct hmac_sha1_ctx ctx; + + hmac_sha1_init(&ctx, key); + hmac_sha1_update(&ctx, data, data_len); + hmac_sha1_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha1); + +void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA1_DIGEST_SIZE]) +{ + struct hmac_sha1_ctx ctx; + + hmac_sha1_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha1_update(&ctx, data, data_len); + hmac_sha1_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha1_usingrawkey); + #ifdef sha1_mod_init_arch static int __init sha1_mod_init(void) { @@ -242,5 +344,5 @@ static void __exit sha1_mod_exit(void) module_exit(sha1_mod_exit); #endif -MODULE_DESCRIPTION("SHA-1 library functions"); +MODULE_DESCRIPTION("SHA-1 and HMAC-SHA1 library functions"); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 8bc79ab67d78e2991b9d6cf0b63789189212375a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:22:56 -0700 Subject: crypto: sha1 - Wrap library and add HMAC support Like I did for crypto/sha512.c, rework crypto/sha1_generic.c (renamed to crypto/sha1.c) to simply wrap the normal library functions instead of accessing the low-level block function directly. Also add support for HMAC-SHA1, again just wrapping the library functions. Since the replacement crypto_shash algorithms are implemented using the (potentially arch-optimized) library functions, give them driver names ending with "-lib" rather than "-generic". Update crypto/testmgr.c and an odd driver to take this change in driver name into account. Note: to see the diff from crypto/sha1_generic.c to crypto/sha1.c, view this commit with 'git show -M10'. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Kconfig | 3 +- crypto/Makefile | 2 +- crypto/sha1.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++ crypto/sha1_generic.c | 87 ------------------------------ crypto/testmgr.c | 6 +++ drivers/crypto/img-hash.c | 2 +- 6 files changed, 145 insertions(+), 90 deletions(-) create mode 100644 crypto/sha1.c delete mode 100644 crypto/sha1_generic.c diff --git a/crypto/Kconfig b/crypto/Kconfig index 5d4cf022c577..23bd98981ae8 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -986,7 +986,8 @@ config CRYPTO_SHA1 select CRYPTO_HASH select CRYPTO_LIB_SHA1 help - SHA-1 secure hash algorithm (FIPS 180, ISO/IEC 10118-3) + SHA-1 secure hash algorithm (FIPS 180, ISO/IEC 10118-3), including + HMAC support. 
config CRYPTO_SHA256 tristate "SHA-224 and SHA-256" diff --git a/crypto/Makefile b/crypto/Makefile index 816607e0e78c..9110f708e5a9 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -75,7 +75,7 @@ obj-$(CONFIG_CRYPTO_NULL) += crypto_null.o obj-$(CONFIG_CRYPTO_MD4) += md4.o obj-$(CONFIG_CRYPTO_MD5) += md5.o obj-$(CONFIG_CRYPTO_RMD160) += rmd160.o -obj-$(CONFIG_CRYPTO_SHA1) += sha1_generic.o +obj-$(CONFIG_CRYPTO_SHA1) += sha1.o obj-$(CONFIG_CRYPTO_SHA256) += sha256.o obj-$(CONFIG_CRYPTO_SHA512) += sha512.o obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o diff --git a/crypto/sha1.c b/crypto/sha1.c new file mode 100644 index 000000000000..00e273b0401d --- /dev/null +++ b/crypto/sha1.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Crypto API support for SHA-1 and HMAC-SHA1 + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald + * Copyright (c) Jean-Francois Dive + * Copyright 2025 Google LLC + */ +#include +#include +#include +#include + +const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = { + 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, + 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, + 0xaf, 0xd8, 0x07, 0x09 +}; +EXPORT_SYMBOL_GPL(sha1_zero_message_hash); + +#define SHA1_CTX(desc) ((struct sha1_ctx *)shash_desc_ctx(desc)) + +static int crypto_sha1_init(struct shash_desc *desc) +{ + sha1_init(SHA1_CTX(desc)); + return 0; +} + +static int crypto_sha1_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + sha1_update(SHA1_CTX(desc), data, len); + return 0; +} + +static int crypto_sha1_final(struct shash_desc *desc, u8 *out) +{ + sha1_final(SHA1_CTX(desc), out); + return 0; +} + +static int crypto_sha1_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + sha1(data, len, out); + return 0; +} + +#define HMAC_SHA1_KEY(tfm) ((struct hmac_sha1_key *)crypto_shash_ctx(tfm)) +#define HMAC_SHA1_CTX(desc) ((struct hmac_sha1_ctx *)shash_desc_ctx(desc)) + +static int crypto_hmac_sha1_setkey(struct crypto_shash *tfm, + const u8 *raw_key, unsigned int keylen) +{ + hmac_sha1_preparekey(HMAC_SHA1_KEY(tfm), raw_key, keylen); + return 0; +} + +static int crypto_hmac_sha1_init(struct shash_desc *desc) +{ + hmac_sha1_init(HMAC_SHA1_CTX(desc), HMAC_SHA1_KEY(desc->tfm)); + return 0; +} + +static int crypto_hmac_sha1_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + hmac_sha1_update(HMAC_SHA1_CTX(desc), data, len); + return 0; +} + +static int crypto_hmac_sha1_final(struct shash_desc *desc, u8 *out) +{ + hmac_sha1_final(HMAC_SHA1_CTX(desc), out); + return 0; +} + +static int crypto_hmac_sha1_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + hmac_sha1(HMAC_SHA1_KEY(desc->tfm), data, len, out); + return 0; +} + +static struct shash_alg algs[] = { + { + .base.cra_name = "sha1", + .base.cra_driver_name = "sha1-lib", + .base.cra_priority = 300, + .base.cra_blocksize = SHA1_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .digestsize = SHA1_DIGEST_SIZE, + .init = crypto_sha1_init, + .update = crypto_sha1_update, + .final = crypto_sha1_final, + .digest = crypto_sha1_digest, + .descsize = sizeof(struct sha1_ctx), + }, + { + .base.cra_name = "hmac(sha1)", + .base.cra_driver_name = "hmac-sha1-lib", + .base.cra_priority = 300, + .base.cra_blocksize = SHA1_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct hmac_sha1_key), + .base.cra_module = THIS_MODULE, + .digestsize = SHA1_DIGEST_SIZE, + .setkey = crypto_hmac_sha1_setkey, + .init = crypto_hmac_sha1_init, + .update = 
crypto_hmac_sha1_update, + .final = crypto_hmac_sha1_final, + .digest = crypto_hmac_sha1_digest, + .descsize = sizeof(struct hmac_sha1_ctx), + }, +}; + +static int __init crypto_sha1_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} +module_init(crypto_sha1_mod_init); + +static void __exit crypto_sha1_mod_exit(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} +module_exit(crypto_sha1_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Crypto API support for SHA-1 and HMAC-SHA1"); + +MODULE_ALIAS_CRYPTO("sha1"); +MODULE_ALIAS_CRYPTO("sha1-lib"); +MODULE_ALIAS_CRYPTO("hmac(sha1)"); +MODULE_ALIAS_CRYPTO("hmac-sha1-lib"); diff --git a/crypto/sha1_generic.c b/crypto/sha1_generic.c deleted file mode 100644 index 024e8043bab0..000000000000 --- a/crypto/sha1_generic.c +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * - * SHA1 Secure Hash Algorithm. - * - * Derived from cryptoapi implementation, adapted for in-place - * scatterlist interface. - * - * Copyright (c) Alan Smithee. - * Copyright (c) Andrew McDonald - * Copyright (c) Jean-Francois Dive - */ -#include -#include -#include -#include -#include -#include - -const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = { - 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, - 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, - 0xaf, 0xd8, 0x07, 0x09 -}; -EXPORT_SYMBOL_GPL(sha1_zero_message_hash); - -static void sha1_generic_block_fn(struct sha1_state *sst, u8 const *src, - int blocks) -{ - u32 temp[SHA1_WORKSPACE_WORDS]; - - while (blocks--) { - sha1_transform(sst->state, src, temp); - src += SHA1_BLOCK_SIZE; - } - memzero_explicit(temp, sizeof(temp)); -} - -static int crypto_sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_base_do_update_blocks(desc, data, len, - sha1_generic_block_fn); -} - -static int crypto_sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha1_base_do_finup(desc, data, len, sha1_generic_block_fn); - return sha1_base_finish(desc, out); -} - -static struct shash_alg alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = crypto_sha1_update, - .finup = crypto_sha1_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name= "sha1-generic", - .cra_priority = 100, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha1_generic_mod_init(void) -{ - return crypto_register_shash(&alg); -} - -static void __exit sha1_generic_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(sha1_generic_mod_init); -module_exit(sha1_generic_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm"); - -MODULE_ALIAS_CRYPTO("sha1"); -MODULE_ALIAS_CRYPTO("sha1-generic"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 4e95567f7ed1..be78e3930769 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4237,6 +4237,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "authenc(hmac(sha1),cbc(aes))", + .generic_driver = "authenc(hmac-sha1-lib,cbc(aes-generic))", .test = alg_test_aead, .fips_allowed = 1, .suite = { @@ -4244,12 +4245,14 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "authenc(hmac(sha1),cbc(des))", + .generic_driver = "authenc(hmac-sha1-lib,cbc(des-generic))", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_sha1_des_cbc_tv_temp) } }, 
{ .alg = "authenc(hmac(sha1),cbc(des3_ede))", + .generic_driver = "authenc(hmac-sha1-lib,cbc(des3_ede-generic))", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_sha1_des3_ede_cbc_tv_temp) @@ -4260,6 +4263,7 @@ static const struct alg_test_desc alg_test_descs[] = { .fips_allowed = 1, }, { .alg = "authenc(hmac(sha1),ecb(cipher_null))", + .generic_driver = "authenc(hmac-sha1-lib,ecb-cipher_null)", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_sha1_ecb_cipher_null_tv_temp) @@ -5122,6 +5126,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha1)", + .generic_driver = "hmac-sha1-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5462,6 +5467,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha1", + .generic_driver = "sha1-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c index f312eb075fec..a8f735390f0d 100644 --- a/drivers/crypto/img-hash.c +++ b/drivers/crypto/img-hash.c @@ -705,7 +705,7 @@ static int img_hash_cra_md5_init(struct crypto_tfm *tfm) static int img_hash_cra_sha1_init(struct crypto_tfm *tfm) { - return img_hash_cra_init(tfm, "sha1-generic"); + return img_hash_cra_init(tfm, "sha1-lib"); } static int img_hash_cra_sha224_init(struct crypto_tfm *tfm) -- cgit v1.2.3 From b10a74abcfc5c61afc63a567c457038be57eeb6e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:22:57 -0700 Subject: crypto: sha1 - Use same state format as legacy drivers Same as sha256 and sha512: Use the state format that the generic partial block handling code produces, as requested by Herbert, even though this is applicable only to legacy drivers. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/sha1.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/crypto/sha1.c b/crypto/sha1.c index 00e273b0401d..ecef4bf2d9c0 100644 --- a/crypto/sha1.c +++ b/crypto/sha1.c @@ -12,6 +12,43 @@ #include #include +/* + * Export and import functions. crypto_shash wants a particular format that + * matches that used by some legacy drivers. It currently is the same as the + * library SHA context, except the value in bytecount must be block-aligned and + * the remainder must be stored in an extra u8 appended to the struct. 
+ */ + +#define SHA1_SHASH_STATE_SIZE (sizeof(struct sha1_ctx) + 1) +static_assert(sizeof(struct sha1_ctx) == sizeof(struct sha1_state)); +static_assert(offsetof(struct sha1_ctx, state) == offsetof(struct sha1_state, state)); +static_assert(offsetof(struct sha1_ctx, bytecount) == offsetof(struct sha1_state, count)); +static_assert(offsetof(struct sha1_ctx, buf) == offsetof(struct sha1_state, buffer)); + +static int __crypto_sha1_export(const struct sha1_ctx *ctx0, void *out) +{ + struct sha1_ctx ctx = *ctx0; + unsigned int partial; + u8 *p = out; + + partial = ctx.bytecount % SHA1_BLOCK_SIZE; + ctx.bytecount -= partial; + memcpy(p, &ctx, sizeof(ctx)); + p += sizeof(ctx); + *p = partial; + return 0; +} + +static int __crypto_sha1_import(struct sha1_ctx *ctx, const void *in) +{ + const u8 *p = in; + + memcpy(ctx, p, sizeof(*ctx)); + p += sizeof(*ctx); + ctx->bytecount += *p; + return 0; +} + const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = { 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, @@ -47,6 +84,16 @@ static int crypto_sha1_digest(struct shash_desc *desc, return 0; } +static int crypto_sha1_export(struct shash_desc *desc, void *out) +{ + return __crypto_sha1_export(SHA1_CTX(desc), out); +} + +static int crypto_sha1_import(struct shash_desc *desc, const void *in) +{ + return __crypto_sha1_import(SHA1_CTX(desc), in); +} + #define HMAC_SHA1_KEY(tfm) ((struct hmac_sha1_key *)crypto_shash_ctx(tfm)) #define HMAC_SHA1_CTX(desc) ((struct hmac_sha1_ctx *)shash_desc_ctx(desc)) @@ -83,6 +130,19 @@ static int crypto_hmac_sha1_digest(struct shash_desc *desc, return 0; } +static int crypto_hmac_sha1_export(struct shash_desc *desc, void *out) +{ + return __crypto_sha1_export(&HMAC_SHA1_CTX(desc)->sha_ctx, out); +} + +static int crypto_hmac_sha1_import(struct shash_desc *desc, const void *in) +{ + struct hmac_sha1_ctx *ctx = HMAC_SHA1_CTX(desc); + + ctx->ostate = HMAC_SHA1_KEY(desc->tfm)->ostate; + return __crypto_sha1_import(&ctx->sha_ctx, in); +} + static struct shash_alg algs[] = { { .base.cra_name = "sha1", @@ -95,7 +155,10 @@ static struct shash_alg algs[] = { .update = crypto_sha1_update, .final = crypto_sha1_final, .digest = crypto_sha1_digest, + .export = crypto_sha1_export, + .import = crypto_sha1_import, .descsize = sizeof(struct sha1_ctx), + .statesize = SHA1_SHASH_STATE_SIZE, }, { .base.cra_name = "hmac(sha1)", @@ -110,7 +173,10 @@ static struct shash_alg algs[] = { .update = crypto_hmac_sha1_update, .final = crypto_hmac_sha1_final, .digest = crypto_hmac_sha1_digest, + .export = crypto_hmac_sha1_export, + .import = crypto_hmac_sha1_import, .descsize = sizeof(struct hmac_sha1_ctx), + .statesize = SHA1_SHASH_STATE_SIZE, }, }; -- cgit v1.2.3 From 70cb6ca58fddb02e269fe743ba75d53d577b5b1c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:22:58 -0700 Subject: lib/crypto: arm/sha1: Migrate optimized code into library Instead of exposing the arm-optimized SHA-1 code via arm-specific crypto_shash algorithms, instead just implement the sha1_blocks() library function. This is much simpler, it makes the SHA-1 library functions be arm-optimized, and it fixes the longstanding issue where the arm-optimized SHA-1 code was disabled by default. SHA-1 still remains available through crypto_shash, but individual architectures no longer need to handle it. To match sha1_blocks(), change the type of the nblocks parameter of the assembly functions from int to size_t. The assembly functions actually already treated it as size_t. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm/configs/exynos_defconfig | 1 - arch/arm/configs/milbeaut_m10v_defconfig | 2 - arch/arm/configs/multi_v7_defconfig | 2 - arch/arm/configs/omap2plus_defconfig | 1 - arch/arm/configs/pxa_defconfig | 1 - arch/arm/crypto/Kconfig | 31 -- arch/arm/crypto/Makefile | 6 - arch/arm/crypto/sha1-armv4-large.S | 507 ------------------------ arch/arm/crypto/sha1-armv7-neon.S | 634 ------------------------------- arch/arm/crypto/sha1-ce-core.S | 123 ------ arch/arm/crypto/sha1-ce-glue.c | 72 ---- arch/arm/crypto/sha1_glue.c | 75 ---- arch/arm/crypto/sha1_neon_glue.c | 83 ---- lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 5 + lib/crypto/arm/sha1-armv4-large.S | 507 ++++++++++++++++++++++++ lib/crypto/arm/sha1-armv7-neon.S | 633 ++++++++++++++++++++++++++++++ lib/crypto/arm/sha1-ce-core.S | 123 ++++++ lib/crypto/arm/sha1.h | 46 +++ 19 files changed, 1315 insertions(+), 1538 deletions(-) delete mode 100644 arch/arm/crypto/sha1-armv4-large.S delete mode 100644 arch/arm/crypto/sha1-armv7-neon.S delete mode 100644 arch/arm/crypto/sha1-ce-core.S delete mode 100644 arch/arm/crypto/sha1-ce-glue.c delete mode 100644 arch/arm/crypto/sha1_glue.c delete mode 100644 arch/arm/crypto/sha1_neon_glue.c create mode 100644 lib/crypto/arm/sha1-armv4-large.S create mode 100644 lib/crypto/arm/sha1-armv7-neon.S create mode 100644 lib/crypto/arm/sha1-ce-core.S create mode 100644 lib/crypto/arm/sha1.h diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig index d58e30069304..6915c766923a 100644 --- a/arch/arm/configs/exynos_defconfig +++ b/arch/arm/configs/exynos_defconfig @@ -363,7 +363,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA1_ARM_NEON=m CONFIG_CRYPTO_AES_ARM_BS=m CONFIG_CRYPTO_CHACHA20_NEON=m CONFIG_CRYPTO_DEV_EXYNOS_RNG=y diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig index 8ebf8bd872fe..a3be0b2ede09 100644 --- a/arch/arm/configs/milbeaut_m10v_defconfig +++ b/arch/arm/configs/milbeaut_m10v_defconfig @@ -98,8 +98,6 @@ CONFIG_CRYPTO_SELFTESTS=y CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_SEQIV=m CONFIG_CRYPTO_GHASH_ARM_CE=m -CONFIG_CRYPTO_SHA1_ARM_NEON=m -CONFIG_CRYPTO_SHA1_ARM_CE=m CONFIG_CRYPTO_AES_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m CONFIG_CRYPTO_AES_ARM_CE=m diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index 3fd07e864ca8..fb63f487a623 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -1280,8 +1280,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_CRYPTO_GHASH_ARM_CE=m -CONFIG_CRYPTO_SHA1_ARM_NEON=m -CONFIG_CRYPTO_SHA1_ARM_CE=m CONFIG_CRYPTO_AES_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m CONFIG_CRYPTO_AES_ARM_CE=m diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 530dfb8338c9..046467637901 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -704,7 +704,6 @@ CONFIG_NLS_ISO8859_1=y CONFIG_SECURITY=y CONFIG_CRYPTO_MICHAEL_MIC=y CONFIG_CRYPTO_GHASH_ARM_CE=m -CONFIG_CRYPTO_SHA1_ARM_NEON=m CONFIG_CRYPTO_AES_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m CONFIG_CRYPTO_CHACHA20_NEON=m diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index eaa44574d4a6..1a80602c1284 100644 --- 
a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -658,7 +658,6 @@ CONFIG_CRYPTO_ANUBIS=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_DEFLATE=y CONFIG_CRYPTO_LZO=y -CONFIG_CRYPTO_SHA1_ARM=m CONFIG_CRYPTO_AES_ARM=m CONFIG_FONTS=y CONFIG_FONT_8x8=y diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index a18f97f1597c..1e5f3cdf691c 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -62,37 +62,6 @@ config CRYPTO_BLAKE2B_NEON much faster than the SHA-2 family and slightly faster than SHA-1. -config CRYPTO_SHA1_ARM - tristate "Hash functions: SHA-1" - select CRYPTO_SHA1 - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: arm - -config CRYPTO_SHA1_ARM_NEON - tristate "Hash functions: SHA-1 (NEON)" - depends on KERNEL_MODE_NEON - select CRYPTO_SHA1_ARM - select CRYPTO_SHA1 - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: arm using - - NEON (Advanced SIMD) extensions - -config CRYPTO_SHA1_ARM_CE - tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_SHA1_ARM - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: arm using ARMv8 Crypto Extensions - config CRYPTO_AES_ARM tristate "Ciphers: AES" select CRYPTO_ALGAPI diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 78a4042d8761..4f23999ae17d 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -5,22 +5,16 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o -obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o -obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o -obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o aes-arm-y := aes-cipher-core.o aes-cipher-glue.o aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o -sha1-arm-y := sha1-armv4-large.o sha1_glue.o -sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o -sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o diff --git a/arch/arm/crypto/sha1-armv4-large.S b/arch/arm/crypto/sha1-armv4-large.S deleted file mode 100644 index 1c8b685149f2..000000000000 --- a/arch/arm/crypto/sha1-armv4-large.S +++ /dev/null @@ -1,507 +0,0 @@ -#define __ARM_ARCH__ __LINUX_ARM_ARCH__ -@ SPDX-License-Identifier: GPL-2.0 - -@ This code is taken from the OpenSSL project but the author (Andy Polyakov) -@ has relicensed it under the GPLv2. Therefore this program is free software; -@ you can redistribute it and/or modify it under the terms of the GNU General -@ Public License version 2 as published by the Free Software Foundation. -@ -@ The original headers, including the original license headers, are -@ included below for completeness. - -@ ==================================================================== -@ Written by Andy Polyakov for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see https://www.openssl.org/~appro/cryptogams/. 
-@ ==================================================================== - -@ sha1_block procedure for ARMv4. -@ -@ January 2007. - -@ Size/performance trade-off -@ ==================================================================== -@ impl size in bytes comp cycles[*] measured performance -@ ==================================================================== -@ thumb 304 3212 4420 -@ armv4-small 392/+29% 1958/+64% 2250/+96% -@ armv4-compact 740/+89% 1552/+26% 1840/+22% -@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***] -@ full unroll ~5100/+260% ~1260/+4% ~1300/+5% -@ ==================================================================== -@ thumb = same as 'small' but in Thumb instructions[**] and -@ with recurring code in two private functions; -@ small = detached Xload/update, loops are folded; -@ compact = detached Xload/update, 5x unroll; -@ large = interleaved Xload/update, 5x unroll; -@ full unroll = interleaved Xload/update, full unroll, estimated[!]; -@ -@ [*] Manually counted instructions in "grand" loop body. Measured -@ performance is affected by prologue and epilogue overhead, -@ i-cache availability, branch penalties, etc. -@ [**] While each Thumb instruction is twice smaller, they are not as -@ diverse as ARM ones: e.g., there are only two arithmetic -@ instructions with 3 arguments, no [fixed] rotate, addressing -@ modes are limited. As result it takes more instructions to do -@ the same job in Thumb, therefore the code is never twice as -@ small and always slower. -@ [***] which is also ~35% better than compiler generated code. Dual- -@ issue Cortex A8 core was measured to process input block in -@ ~990 cycles. - -@ August 2010. -@ -@ Rescheduling for dual-issue pipeline resulted in 13% improvement on -@ Cortex A8 core and in absolute terms ~870 cycles per input block -@ [or 13.6 cycles per byte]. - -@ February 2011. -@ -@ Profiler-assisted and platform-specific optimization resulted in 10% -@ improvement on Cortex A8 core and 12.2 cycles per byte. - -#include - -.text - -.align 2 -ENTRY(sha1_block_data_order) - stmdb sp!,{r4-r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - ldmia r0,{r3,r4,r5,r6,r7} -.Lloop: - ldr r8,.LK_00_19 - mov r14,sp - sub sp,sp,#15*4 - mov r5,r5,ror#30 - mov r6,r6,ror#30 - mov r7,r7,ror#30 @ [6] -.L_00_15: -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r6,r8,r6,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r4,r5 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r6,r8,r6,ror#2 @ E+=K_00_19 - eor r10,r4,r5 @ F_xx_xx - add r6,r6,r7,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r3,r10,ror#2 - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! 
- add r6,r6,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r5,r8,r5,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r3,r4 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r5,r8,r5,ror#2 @ E+=K_00_19 - eor r10,r3,r4 @ F_xx_xx - add r5,r5,r6,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r7,r10,ror#2 - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r5,r5,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r4,r8,r4,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r7,r3 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r4,r8,r4,ror#2 @ E+=K_00_19 - eor r10,r7,r3 @ F_xx_xx - add r4,r4,r5,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r6,r10,ror#2 - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r4,r4,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r3,r8,r3,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r6,r7 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r3,r8,r3,ror#2 @ E+=K_00_19 - eor r10,r6,r7 @ F_xx_xx - add r3,r3,r4,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r5,r10,ror#2 - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r3,r3,r10 @ E+=F_00_19(B,C,D) - cmp r14,sp - bne .L_00_15 @ [((11+4)*5+2)*3] - sub sp,sp,#25*4 -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - add r6,r6,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - add r5,r5,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - add r4,r4,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - add r3,r3,r10 @ E+=F_00_19(B,C,D) - - ldr r8,.LK_20_39 @ [+15+16*4] - cmn sp,#0 @ [+3], clear carry to denote 20_39 -.L_20_39_or_60_79: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r4,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- eor r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_20_39(B,C,D) - ARM( teq r14,sp ) @ preserve carry - THUMB( mov r11,sp ) - THUMB( teq r14,r11 ) @ preserve carry - bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] - bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes - - ldr r8,.LK_40_59 - sub sp,sp,#20*4 @ [+2] -.L_40_59: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r4,r10,ror#2 @ F_xx_xx - and r11,r5,r6 @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_40_59(B,C,D) - add r7,r7,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - and r11,r4,r5 @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_40_59(B,C,D) - add r6,r6,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - and r11,r3,r4 @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_40_59(B,C,D) - add r5,r5,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r6,r10,ror#2 @ F_xx_xx - and r11,r7,r3 @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_40_59(B,C,D) - add r4,r4,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r5,r10,ror#2 @ F_xx_xx - and r11,r6,r7 @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_40_59(B,C,D) - add r3,r3,r11,ror#2 - cmp r14,sp - bne .L_40_59 @ [+((12+5)*5+2)*4] - - ldr r8,.LK_60_79 - sub sp,sp,#20*4 - cmp sp,#0 @ set carry to denote 60_79 - b .L_20_39_or_60_79 @ [+4], spare 300 bytes -.L_done: - add sp,sp,#80*4 @ "deallocate" stack frame - ldmia r0,{r8,r9,r10,r11,r12} - add r3,r8,r3 - add r4,r9,r4 - add r5,r10,r5,ror#2 - add r6,r11,r6,ror#2 - add r7,r12,r7,ror#2 - stmia r0,{r3,r4,r5,r6,r7} - teq r1,r2 - bne .Lloop @ [+18], total 1307 - - ldmia sp!,{r4-r12,pc} -.align 2 -.LK_00_19: .word 0x5a827999 -.LK_20_39: .word 0x6ed9eba1 -.LK_40_59: .word 0x8f1bbcdc -.LK_60_79: .word 0xca62c1d6 -ENDPROC(sha1_block_data_order) -.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by " -.align 2 diff --git a/arch/arm/crypto/sha1-armv7-neon.S b/arch/arm/crypto/sha1-armv7-neon.S deleted file mode 100644 index 28d816a6a530..000000000000 --- a/arch/arm/crypto/sha1-armv7-neon.S +++ /dev/null @@ -1,634 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function - * - * Copyright © 2013-2014 Jussi Kivilinna - */ - -#include -#include - -.syntax unified -.fpu neon - -.text - - -/* Context structure */ - -#define state_h0 0 -#define state_h1 4 -#define state_h2 8 -#define state_h3 12 -#define state_h4 16 - - -/* Constants */ - -#define K1 0x5A827999 -#define K2 0x6ED9EBA1 -#define K3 0x8F1BBCDC -#define K4 0xCA62C1D6 -.align 4 -.LK_VEC: -.LK1: .long K1, K1, K1, K1 -.LK2: .long K2, K2, K2, K2 -.LK3: .long K3, K3, K3, K3 -.LK4: .long K4, K4, K4, K4 - - -/* Register macros */ - -#define RSTATE r0 -#define RDATA r1 -#define RNBLKS r2 -#define ROLDSTACK r3 -#define RWK lr - -#define _a r4 -#define _b r5 -#define _c r6 -#define _d r7 -#define _e r8 - -#define RT0 r9 -#define RT1 r10 -#define RT2 r11 -#define RT3 r12 - -#define W0 q0 -#define W1 q7 -#define W2 q2 -#define W3 q3 -#define W4 q4 -#define W5 q6 -#define W6 q5 -#define W7 q1 - -#define tmp0 q8 -#define tmp1 q9 -#define tmp2 q10 -#define tmp3 q11 - -#define qK1 q12 -#define qK2 q13 -#define qK3 q14 -#define qK4 q15 - -#ifdef CONFIG_CPU_BIG_ENDIAN -#define ARM_LE(code...) -#else -#define ARM_LE(code...) code -#endif - -/* Round function macros. 
*/ - -#define WK_offs(i) (((i) & 15) * 4) - -#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ldr RT3, [sp, WK_offs(i)]; \ - pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - bic RT0, d, b; \ - add e, e, a, ror #(32 - 5); \ - and RT1, c, b; \ - pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - add RT0, RT0, RT3; \ - add e, e, RT1; \ - ror b, #(32 - 30); \ - pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - add e, e, RT0; - -#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ldr RT3, [sp, WK_offs(i)]; \ - pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - eor RT0, d, b; \ - add e, e, a, ror #(32 - 5); \ - eor RT0, RT0, c; \ - pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - add e, e, RT3; \ - ror b, #(32 - 30); \ - pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - add e, e, RT0; \ - -#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ldr RT3, [sp, WK_offs(i)]; \ - pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - eor RT0, b, c; \ - and RT1, b, c; \ - add e, e, a, ror #(32 - 5); \ - pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - and RT0, RT0, d; \ - add RT1, RT1, RT3; \ - add e, e, RT0; \ - ror b, #(32 - 30); \ - pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - add e, e, RT1; - -#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) - -#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) - -#define R(a,b,c,d,e,f,i) \ - _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) - -#define dummy(...) - - -/* Input expansion macros. 
*/ - -/********* Precalc macros for rounds 0-15 *************************************/ - -#define W_PRECALC_00_15() \ - add RWK, sp, #(WK_offs(0)); \ - \ - vld1.32 {W0, W7}, [RDATA]!; \ - ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \ - vld1.32 {W6, W5}, [RDATA]!; \ - vadd.u32 tmp0, W0, curK; \ - ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \ - ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \ - vadd.u32 tmp1, W7, curK; \ - ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \ - vadd.u32 tmp2, W6, curK; \ - vst1.32 {tmp0, tmp1}, [RWK]!; \ - vadd.u32 tmp3, W5, curK; \ - vst1.32 {tmp2, tmp3}, [RWK]; \ - -#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vld1.32 {W0, W7}, [RDATA]!; \ - -#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - add RWK, sp, #(WK_offs(0)); \ - -#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \ - -#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vld1.32 {W6, W5}, [RDATA]!; \ - -#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vadd.u32 tmp0, W0, curK; \ - -#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \ - -#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \ - -#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vadd.u32 tmp1, W7, curK; \ - -#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \ - -#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vadd.u32 tmp2, W6, curK; \ - -#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vst1.32 {tmp0, tmp1}, [RWK]!; \ - -#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vadd.u32 tmp3, W5, curK; \ - -#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vst1.32 {tmp2, tmp3}, [RWK]; \ - - -/********* Precalc macros for rounds 16-31 ************************************/ - -#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor tmp0, tmp0; \ - vext.8 W, W_m16, W_m12, #8; \ - -#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - add RWK, sp, #(WK_offs(i)); \ - vext.8 tmp0, W_m04, tmp0, #4; \ - -#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor tmp0, tmp0, W_m16; \ - veor.32 W, W, W_m08; \ - -#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor tmp1, tmp1; \ - veor W, W, tmp0; \ - -#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vshl.u32 tmp0, W, #1; \ - -#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vext.8 tmp1, tmp1, W, #(16-12); \ - vshr.u32 W, W, #31; \ - -#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vorr tmp0, tmp0, W; \ - vshr.u32 W, tmp1, #30; \ - -#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vshl.u32 tmp1, tmp1, #2; \ - -#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor tmp0, tmp0, W; \ - -#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor W, tmp0, tmp1; \ - -#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vadd.u32 tmp0, W, curK; \ - -#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - 
vst1.32 {tmp0}, [RWK]; - - -/********* Precalc macros for rounds 32-79 ************************************/ - -#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor W, W_m28; \ - -#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vext.8 tmp0, W_m08, W_m04, #8; \ - -#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor W, W_m16; \ - -#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor W, tmp0; \ - -#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - add RWK, sp, #(WK_offs(i&~3)); \ - -#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vshl.u32 tmp1, W, #2; \ - -#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vshr.u32 tmp0, W, #30; \ - -#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vorr W, tmp0, tmp1; \ - -#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vadd.u32 tmp0, W, curK; \ - -#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vst1.32 {tmp0}, [RWK]; - - -/* - * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. - * - * unsigned int - * sha1_transform_neon (void *ctx, const unsigned char *data, - * unsigned int nblks) - */ -.align 3 -ENTRY(sha1_transform_neon) - /* input: - * r0: ctx, CTX - * r1: data (64*nblks bytes) - * r2: nblks - */ - - cmp RNBLKS, #0; - beq .Ldo_nothing; - - push {r4-r12, lr}; - /*vpush {q4-q7};*/ - - adr RT3, .LK_VEC; - - mov ROLDSTACK, sp; - - /* Align stack. */ - sub RT0, sp, #(16*4); - and RT0, #(~(16-1)); - mov sp, RT0; - - vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */ - - /* Get the values of the chaining variables. */ - ldm RSTATE, {_a-_e}; - - vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */ - -#undef curK -#define curK qK1 - /* Precalc 0-15. */ - W_PRECALC_00_15(); - -.Loop: - /* Transform 0-15 + Precalc 16-31. 
*/ - _R( _a, _b, _c, _d, _e, F1, 0, - WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16, - W4, W5, W6, W7, W0, _, _, _ ); - _R( _e, _a, _b, _c, _d, F1, 1, - WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16, - W4, W5, W6, W7, W0, _, _, _ ); - _R( _d, _e, _a, _b, _c, F1, 2, - WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16, - W4, W5, W6, W7, W0, _, _, _ ); - _R( _c, _d, _e, _a, _b, F1, 3, - WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16, - W4, W5, W6, W7, W0, _, _, _ ); - -#undef curK -#define curK qK2 - _R( _b, _c, _d, _e, _a, F1, 4, - WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20, - W3, W4, W5, W6, W7, _, _, _ ); - _R( _a, _b, _c, _d, _e, F1, 5, - WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20, - W3, W4, W5, W6, W7, _, _, _ ); - _R( _e, _a, _b, _c, _d, F1, 6, - WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20, - W3, W4, W5, W6, W7, _, _, _ ); - _R( _d, _e, _a, _b, _c, F1, 7, - WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20, - W3, W4, W5, W6, W7, _, _, _ ); - - _R( _c, _d, _e, _a, _b, F1, 8, - WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24, - W2, W3, W4, W5, W6, _, _, _ ); - _R( _b, _c, _d, _e, _a, F1, 9, - WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24, - W2, W3, W4, W5, W6, _, _, _ ); - _R( _a, _b, _c, _d, _e, F1, 10, - WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24, - W2, W3, W4, W5, W6, _, _, _ ); - _R( _e, _a, _b, _c, _d, F1, 11, - WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24, - W2, W3, W4, W5, W6, _, _, _ ); - - _R( _d, _e, _a, _b, _c, F1, 12, - WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28, - W1, W2, W3, W4, W5, _, _, _ ); - _R( _c, _d, _e, _a, _b, F1, 13, - WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28, - W1, W2, W3, W4, W5, _, _, _ ); - _R( _b, _c, _d, _e, _a, F1, 14, - WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28, - W1, W2, W3, W4, W5, _, _, _ ); - _R( _a, _b, _c, _d, _e, F1, 15, - WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28, - W1, W2, W3, W4, W5, _, _, _ ); - - /* Transform 16-63 + Precalc 32-79. 
*/ - _R( _e, _a, _b, _c, _d, F1, 16, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32, - W0, W1, W2, W3, W4, W5, W6, W7); - _R( _d, _e, _a, _b, _c, F1, 17, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32, - W0, W1, W2, W3, W4, W5, W6, W7); - _R( _c, _d, _e, _a, _b, F1, 18, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32, - W0, W1, W2, W3, W4, W5, W6, W7); - _R( _b, _c, _d, _e, _a, F1, 19, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32, - W0, W1, W2, W3, W4, W5, W6, W7); - - _R( _a, _b, _c, _d, _e, F2, 20, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36, - W7, W0, W1, W2, W3, W4, W5, W6); - _R( _e, _a, _b, _c, _d, F2, 21, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36, - W7, W0, W1, W2, W3, W4, W5, W6); - _R( _d, _e, _a, _b, _c, F2, 22, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36, - W7, W0, W1, W2, W3, W4, W5, W6); - _R( _c, _d, _e, _a, _b, F2, 23, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36, - W7, W0, W1, W2, W3, W4, W5, W6); - -#undef curK -#define curK qK3 - _R( _b, _c, _d, _e, _a, F2, 24, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40, - W6, W7, W0, W1, W2, W3, W4, W5); - _R( _a, _b, _c, _d, _e, F2, 25, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40, - W6, W7, W0, W1, W2, W3, W4, W5); - _R( _e, _a, _b, _c, _d, F2, 26, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40, - W6, W7, W0, W1, W2, W3, W4, W5); - _R( _d, _e, _a, _b, _c, F2, 27, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40, - W6, W7, W0, W1, W2, W3, W4, W5); - - _R( _c, _d, _e, _a, _b, F2, 28, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44, - W5, W6, W7, W0, W1, W2, W3, W4); - _R( _b, _c, _d, _e, _a, F2, 29, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44, - W5, W6, W7, W0, W1, W2, W3, W4); - _R( _a, _b, _c, _d, _e, F2, 30, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44, - W5, W6, W7, W0, W1, W2, W3, W4); - _R( _e, _a, _b, _c, _d, F2, 31, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44, - W5, W6, W7, W0, W1, W2, W3, W4); - - _R( _d, _e, _a, _b, _c, F2, 32, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48, - W4, W5, W6, W7, W0, W1, W2, W3); - _R( _c, _d, _e, _a, _b, F2, 33, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48, - W4, W5, W6, W7, W0, W1, W2, W3); - _R( _b, _c, _d, _e, _a, F2, 34, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48, - W4, W5, W6, W7, W0, W1, W2, W3); - _R( _a, _b, _c, _d, _e, F2, 35, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48, - W4, W5, W6, W7, W0, W1, W2, W3); - - _R( _e, _a, _b, _c, _d, F2, 36, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52, - W3, W4, W5, W6, W7, W0, W1, W2); - _R( _d, _e, _a, _b, _c, F2, 37, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52, - W3, W4, W5, W6, W7, W0, W1, W2); - _R( _c, _d, _e, _a, _b, F2, 38, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52, - W3, W4, W5, W6, W7, W0, W1, W2); - _R( _b, _c, _d, _e, _a, F2, 39, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52, - W3, W4, W5, W6, W7, W0, W1, W2); - - _R( _a, _b, _c, _d, _e, F3, 40, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56, - W2, W3, W4, W5, W6, W7, W0, W1); - _R( _e, _a, _b, _c, _d, F3, 41, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56, - W2, W3, W4, W5, W6, W7, W0, W1); - _R( _d, _e, _a, _b, _c, F3, 42, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56, - W2, W3, W4, W5, W6, W7, W0, W1); - _R( _c, _d, _e, _a, _b, F3, 43, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56, - W2, W3, W4, W5, W6, W7, W0, W1); - -#undef curK -#define curK qK4 - _R( _b, _c, _d, 
_e, _a, F3, 44, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60, - W1, W2, W3, W4, W5, W6, W7, W0); - _R( _a, _b, _c, _d, _e, F3, 45, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60, - W1, W2, W3, W4, W5, W6, W7, W0); - _R( _e, _a, _b, _c, _d, F3, 46, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60, - W1, W2, W3, W4, W5, W6, W7, W0); - _R( _d, _e, _a, _b, _c, F3, 47, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60, - W1, W2, W3, W4, W5, W6, W7, W0); - - _R( _c, _d, _e, _a, _b, F3, 48, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64, - W0, W1, W2, W3, W4, W5, W6, W7); - _R( _b, _c, _d, _e, _a, F3, 49, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64, - W0, W1, W2, W3, W4, W5, W6, W7); - _R( _a, _b, _c, _d, _e, F3, 50, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64, - W0, W1, W2, W3, W4, W5, W6, W7); - _R( _e, _a, _b, _c, _d, F3, 51, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64, - W0, W1, W2, W3, W4, W5, W6, W7); - - _R( _d, _e, _a, _b, _c, F3, 52, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68, - W7, W0, W1, W2, W3, W4, W5, W6); - _R( _c, _d, _e, _a, _b, F3, 53, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68, - W7, W0, W1, W2, W3, W4, W5, W6); - _R( _b, _c, _d, _e, _a, F3, 54, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68, - W7, W0, W1, W2, W3, W4, W5, W6); - _R( _a, _b, _c, _d, _e, F3, 55, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68, - W7, W0, W1, W2, W3, W4, W5, W6); - - _R( _e, _a, _b, _c, _d, F3, 56, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72, - W6, W7, W0, W1, W2, W3, W4, W5); - _R( _d, _e, _a, _b, _c, F3, 57, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72, - W6, W7, W0, W1, W2, W3, W4, W5); - _R( _c, _d, _e, _a, _b, F3, 58, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72, - W6, W7, W0, W1, W2, W3, W4, W5); - _R( _b, _c, _d, _e, _a, F3, 59, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72, - W6, W7, W0, W1, W2, W3, W4, W5); - - subs RNBLKS, #1; - - _R( _a, _b, _c, _d, _e, F4, 60, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76, - W5, W6, W7, W0, W1, W2, W3, W4); - _R( _e, _a, _b, _c, _d, F4, 61, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76, - W5, W6, W7, W0, W1, W2, W3, W4); - _R( _d, _e, _a, _b, _c, F4, 62, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76, - W5, W6, W7, W0, W1, W2, W3, W4); - _R( _c, _d, _e, _a, _b, F4, 63, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76, - W5, W6, W7, W0, W1, W2, W3, W4); - - beq .Lend; - - /* Transform 64-79 + Precalc 0-15 of next block. 
*/ -#undef curK -#define curK qK1 - _R( _b, _c, _d, _e, _a, F4, 64, - WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _a, _b, _c, _d, _e, F4, 65, - WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _e, _a, _b, _c, _d, F4, 66, - WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _d, _e, _a, _b, _c, F4, 67, - WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - - _R( _c, _d, _e, _a, _b, F4, 68, - dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _b, _c, _d, _e, _a, F4, 69, - dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _a, _b, _c, _d, _e, F4, 70, - WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _e, _a, _b, _c, _d, F4, 71, - WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - - _R( _d, _e, _a, _b, _c, F4, 72, - dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _c, _d, _e, _a, _b, F4, 73, - dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _b, _c, _d, _e, _a, F4, 74, - WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _a, _b, _c, _d, _e, F4, 75, - WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - - _R( _e, _a, _b, _c, _d, F4, 76, - WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _d, _e, _a, _b, _c, F4, 77, - WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _c, _d, _e, _a, _b, F4, 78, - WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _b, _c, _d, _e, _a, F4, 79, - WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ ); - - /* Update the chaining variables. */ - ldm RSTATE, {RT0-RT3}; - add _a, RT0; - ldr RT0, [RSTATE, #state_h4]; - add _b, RT1; - add _c, RT2; - add _d, RT3; - add _e, RT0; - stm RSTATE, {_a-_e}; - - b .Loop; - -.Lend: - /* Transform 64-79 */ - R( _b, _c, _d, _e, _a, F4, 64 ); - R( _a, _b, _c, _d, _e, F4, 65 ); - R( _e, _a, _b, _c, _d, F4, 66 ); - R( _d, _e, _a, _b, _c, F4, 67 ); - R( _c, _d, _e, _a, _b, F4, 68 ); - R( _b, _c, _d, _e, _a, F4, 69 ); - R( _a, _b, _c, _d, _e, F4, 70 ); - R( _e, _a, _b, _c, _d, F4, 71 ); - R( _d, _e, _a, _b, _c, F4, 72 ); - R( _c, _d, _e, _a, _b, F4, 73 ); - R( _b, _c, _d, _e, _a, F4, 74 ); - R( _a, _b, _c, _d, _e, F4, 75 ); - R( _e, _a, _b, _c, _d, F4, 76 ); - R( _d, _e, _a, _b, _c, F4, 77 ); - R( _c, _d, _e, _a, _b, F4, 78 ); - R( _b, _c, _d, _e, _a, F4, 79 ); - - mov sp, ROLDSTACK; - - /* Update the chaining variables. */ - ldm RSTATE, {RT0-RT3}; - add _a, RT0; - ldr RT0, [RSTATE, #state_h4]; - add _b, RT1; - add _c, RT2; - add _d, RT3; - /*vpop {q4-q7};*/ - add _e, RT0; - stm RSTATE, {_a-_e}; - - pop {r4-r12, pc}; - -.Ldo_nothing: - bx lr -ENDPROC(sha1_transform_neon) diff --git a/arch/arm/crypto/sha1-ce-core.S b/arch/arm/crypto/sha1-ce-core.S deleted file mode 100644 index 8a702e051738..000000000000 --- a/arch/arm/crypto/sha1-ce-core.S +++ /dev/null @@ -1,123 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions - * - * Copyright (C) 2015 Linaro Ltd. 
- * Author: Ard Biesheuvel - */ - -#include -#include - - .text - .arch armv8-a - .fpu crypto-neon-fp-armv8 - - k0 .req q0 - k1 .req q1 - k2 .req q2 - k3 .req q3 - - ta0 .req q4 - ta1 .req q5 - tb0 .req q5 - tb1 .req q4 - - dga .req q6 - dgb .req q7 - dgbs .req s28 - - dg0 .req q12 - dg1a0 .req q13 - dg1a1 .req q14 - dg1b0 .req q14 - dg1b1 .req q13 - - .macro add_only, op, ev, rc, s0, dg1 - .ifnb \s0 - vadd.u32 tb\ev, q\s0, \rc - .endif - sha1h.32 dg1b\ev, dg0 - .ifb \dg1 - sha1\op\().32 dg0, dg1a\ev, ta\ev - .else - sha1\op\().32 dg0, \dg1, ta\ev - .endif - .endm - - .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 - sha1su0.32 q\s0, q\s1, q\s2 - add_only \op, \ev, \rc, \s1, \dg1 - sha1su1.32 q\s0, q\s3 - .endm - - .align 6 -.Lsha1_rcon: - .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 - .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 - .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc - .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 - - /* - * void sha1_ce_transform(struct sha1_state *sst, u8 const *src, - * int blocks); - */ -ENTRY(sha1_ce_transform) - /* load round constants */ - adr ip, .Lsha1_rcon - vld1.32 {k0-k1}, [ip, :128]! - vld1.32 {k2-k3}, [ip, :128] - - /* load state */ - vld1.32 {dga}, [r0] - vldr dgbs, [r0, #16] - - /* load input */ -0: vld1.32 {q8-q9}, [r1]! - vld1.32 {q10-q11}, [r1]! - subs r2, r2, #1 - -#ifndef CONFIG_CPU_BIG_ENDIAN - vrev32.8 q8, q8 - vrev32.8 q9, q9 - vrev32.8 q10, q10 - vrev32.8 q11, q11 -#endif - - vadd.u32 ta0, q8, k0 - vmov dg0, dga - - add_update c, 0, k0, 8, 9, 10, 11, dgb - add_update c, 1, k0, 9, 10, 11, 8 - add_update c, 0, k0, 10, 11, 8, 9 - add_update c, 1, k0, 11, 8, 9, 10 - add_update c, 0, k1, 8, 9, 10, 11 - - add_update p, 1, k1, 9, 10, 11, 8 - add_update p, 0, k1, 10, 11, 8, 9 - add_update p, 1, k1, 11, 8, 9, 10 - add_update p, 0, k1, 8, 9, 10, 11 - add_update p, 1, k2, 9, 10, 11, 8 - - add_update m, 0, k2, 10, 11, 8, 9 - add_update m, 1, k2, 11, 8, 9, 10 - add_update m, 0, k2, 8, 9, 10, 11 - add_update m, 1, k2, 9, 10, 11, 8 - add_update m, 0, k3, 10, 11, 8, 9 - - add_update p, 1, k3, 11, 8, 9, 10 - add_only p, 0, k3, 9 - add_only p, 1, k3, 10 - add_only p, 0, k3, 11 - add_only p, 1 - - /* update state */ - vadd.u32 dga, dga, dg0 - vadd.u32 dgb, dgb, dg1a0 - bne 0b - - /* store new state */ - vst1.32 {dga}, [r0] - vstr dgbs, [r0, #16] - bx lr -ENDPROC(sha1_ce_transform) diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c deleted file mode 100644 index fac07a4799de..000000000000 --- a/arch/arm/crypto/sha1-ce-glue.c +++ /dev/null @@ -1,72 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions - * - * Copyright (C) 2015 Linaro Ltd - */ - -#include -#include -#include -#include -#include -#include -#include - -MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); - -asmlinkage void sha1_ce_transform(struct sha1_state *sst, u8 const *src, - int blocks); - -static int sha1_ce_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - int remain; - - kernel_neon_begin(); - remain = sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform); - kernel_neon_end(); - - return remain; -} - -static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - kernel_neon_begin(); - sha1_base_do_finup(desc, data, len, sha1_ce_transform); - kernel_neon_end(); - - return sha1_base_finish(desc, out); -} - -static 
struct shash_alg alg = { - .init = sha1_base_init, - .update = sha1_ce_update, - .finup = sha1_ce_finup, - .descsize = SHA1_STATE_SIZE, - .digestsize = SHA1_DIGEST_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ce", - .cra_priority = 200, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha1_ce_mod_init(void) -{ - return crypto_register_shash(&alg); -} - -static void __exit sha1_ce_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_cpu_feature_match(SHA1, sha1_ce_mod_init); -module_exit(sha1_ce_mod_fini); diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c deleted file mode 100644 index 255da00c7d98..000000000000 --- a/arch/arm/crypto/sha1_glue.c +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * Glue code for the SHA1 Secure Hash Algorithm assembler implementation - * - * This file is based on sha1_generic.c and sha1_ssse3_glue.c - * - * Copyright (c) Alan Smithee. - * Copyright (c) Andrew McDonald - * Copyright (c) Jean-Francois Dive - * Copyright (c) Mathias Krause - */ - -#include -#include -#include -#include -#include - -asmlinkage void sha1_block_data_order(struct sha1_state *digest, - const u8 *data, int rounds); - -static int sha1_update_arm(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - /* make sure signature matches sha1_block_fn() */ - BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); - - return sha1_base_do_update_blocks(desc, data, len, - sha1_block_data_order); -} - -static int sha1_finup_arm(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha1_base_do_finup(desc, data, len, sha1_block_data_order); - return sha1_base_finish(desc, out); -} - -static struct shash_alg alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_update_arm, - .finup = sha1_finup_arm, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name= "sha1-asm", - .cra_priority = 150, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - - -static int __init sha1_mod_init(void) -{ - return crypto_register_shash(&alg); -} - - -static void __exit sha1_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - - -module_init(sha1_mod_init); -module_exit(sha1_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm (ARM)"); -MODULE_ALIAS_CRYPTO("sha1"); -MODULE_AUTHOR("David McCullough "); diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c deleted file mode 100644 index d321850f22a6..000000000000 --- a/arch/arm/crypto/sha1_neon_glue.c +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using - * ARM NEON instructions. - * - * Copyright © 2014 Jussi Kivilinna - * - * This file is based on sha1_generic.c and sha1_ssse3_glue.c: - * Copyright (c) Alan Smithee. 
- * Copyright (c) Andrew McDonald - * Copyright (c) Jean-Francois Dive - * Copyright (c) Mathias Krause - * Copyright (c) Chandramouli Narayanan - */ - -#include -#include -#include -#include -#include -#include - -asmlinkage void sha1_transform_neon(struct sha1_state *state_h, - const u8 *data, int rounds); - -static int sha1_neon_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - int remain; - - kernel_neon_begin(); - remain = sha1_base_do_update_blocks(desc, data, len, - sha1_transform_neon); - kernel_neon_end(); - - return remain; -} - -static int sha1_neon_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - kernel_neon_begin(); - sha1_base_do_finup(desc, data, len, sha1_transform_neon); - kernel_neon_end(); - - return sha1_base_finish(desc, out); -} - -static struct shash_alg alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_neon_update, - .finup = sha1_neon_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-neon", - .cra_priority = 250, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha1_neon_mod_init(void) -{ - if (!cpu_has_neon()) - return -ENODEV; - - return crypto_register_shash(&alg); -} - -static void __exit sha1_neon_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(sha1_neon_mod_init); -module_exit(sha1_neon_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, NEON accelerated"); -MODULE_ALIAS_CRYPTO("sha1"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 5aaf484fc9de..519c5d6a050f 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -146,6 +146,7 @@ config CRYPTO_LIB_SHA1 config CRYPTO_LIB_SHA1_ARCH bool depends on CRYPTO_LIB_SHA1 && !UML + default y if ARM config CRYPTO_LIB_SHA256 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 0eb0906d693f..699a42133927 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -71,6 +71,11 @@ obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o libsha1-y := sha1.o ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y) CFLAGS_sha1.o += -I$(src)/$(SRCARCH) +ifeq ($(CONFIG_ARM),y) +libsha1-y += arm/sha1-armv4-large.o +libsha1-$(CONFIG_KERNEL_MODE_NEON) += arm/sha1-armv7-neon.o \ + arm/sha1-ce-core.o +endif endif # CONFIG_CRYPTO_LIB_SHA1_ARCH ################################################################################ diff --git a/lib/crypto/arm/sha1-armv4-large.S b/lib/crypto/arm/sha1-armv4-large.S new file mode 100644 index 000000000000..1c8b685149f2 --- /dev/null +++ b/lib/crypto/arm/sha1-armv4-large.S @@ -0,0 +1,507 @@ +#define __ARM_ARCH__ __LINUX_ARM_ARCH__ +@ SPDX-License-Identifier: GPL-2.0 + +@ This code is taken from the OpenSSL project but the author (Andy Polyakov) +@ has relicensed it under the GPLv2. Therefore this program is free software; +@ you can redistribute it and/or modify it under the terms of the GNU General +@ Public License version 2 as published by the Free Software Foundation. +@ +@ The original headers, including the original license headers, are +@ included below for completeness. + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see https://www.openssl.org/~appro/cryptogams/. 
+@ ==================================================================== + +@ sha1_block procedure for ARMv4. +@ +@ January 2007. + +@ Size/performance trade-off +@ ==================================================================== +@ impl size in bytes comp cycles[*] measured performance +@ ==================================================================== +@ thumb 304 3212 4420 +@ armv4-small 392/+29% 1958/+64% 2250/+96% +@ armv4-compact 740/+89% 1552/+26% 1840/+22% +@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***] +@ full unroll ~5100/+260% ~1260/+4% ~1300/+5% +@ ==================================================================== +@ thumb = same as 'small' but in Thumb instructions[**] and +@ with recurring code in two private functions; +@ small = detached Xload/update, loops are folded; +@ compact = detached Xload/update, 5x unroll; +@ large = interleaved Xload/update, 5x unroll; +@ full unroll = interleaved Xload/update, full unroll, estimated[!]; +@ +@ [*] Manually counted instructions in "grand" loop body. Measured +@ performance is affected by prologue and epilogue overhead, +@ i-cache availability, branch penalties, etc. +@ [**] While each Thumb instruction is twice smaller, they are not as +@ diverse as ARM ones: e.g., there are only two arithmetic +@ instructions with 3 arguments, no [fixed] rotate, addressing +@ modes are limited. As result it takes more instructions to do +@ the same job in Thumb, therefore the code is never twice as +@ small and always slower. +@ [***] which is also ~35% better than compiler generated code. Dual- +@ issue Cortex A8 core was measured to process input block in +@ ~990 cycles. + +@ August 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 13% improvement on +@ Cortex A8 core and in absolute terms ~870 cycles per input block +@ [or 13.6 cycles per byte]. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 10% +@ improvement on Cortex A8 core and 12.2 cycles per byte. + +#include + +.text + +.align 2 +ENTRY(sha1_block_data_order) + stmdb sp!,{r4-r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + ldmia r0,{r3,r4,r5,r6,r7} +.Lloop: + ldr r8,.LK_00_19 + mov r14,sp + sub sp,sp,#15*4 + mov r5,r5,ror#30 + mov r6,r6,ror#30 + mov r7,r7,ror#30 @ [6] +.L_00_15: +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r6,r8,r6,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r4,r5 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r6,r8,r6,ror#2 @ E+=K_00_19 + eor r10,r4,r5 @ F_xx_xx + add r6,r6,r7,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r3,r10,ror#2 + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! 
+ add r6,r6,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r5,r8,r5,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r3,r4 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r5,r8,r5,ror#2 @ E+=K_00_19 + eor r10,r3,r4 @ F_xx_xx + add r5,r5,r6,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r7,r10,ror#2 + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r5,r5,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r4,r8,r4,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r7,r3 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r4,r8,r4,ror#2 @ E+=K_00_19 + eor r10,r7,r3 @ F_xx_xx + add r4,r4,r5,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r6,r10,ror#2 + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r4,r4,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r3,r8,r3,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r6,r7 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r3,r8,r3,ror#2 @ E+=K_00_19 + eor r10,r6,r7 @ F_xx_xx + add r3,r3,r4,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r5,r10,ror#2 + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r3,r3,r10 @ E+=F_00_19(B,C,D) + cmp r14,sp + bne .L_00_15 @ [((11+4)*5+2)*3] + sub sp,sp,#25*4 +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + add r6,r6,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + add r5,r5,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + add r4,r4,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + add r3,r3,r10 @ E+=F_00_19(B,C,D) + + ldr r8,.LK_20_39 @ [+15+16*4] + cmn sp,#0 @ [+3], clear carry to denote 20_39 +.L_20_39_or_60_79: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r4,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ eor r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_20_39(B,C,D) + ARM( teq r14,sp ) @ preserve carry + THUMB( mov r11,sp ) + THUMB( teq r14,r11 ) @ preserve carry + bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] + bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes + + ldr r8,.LK_40_59 + sub sp,sp,#20*4 @ [+2] +.L_40_59: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r4,r10,ror#2 @ F_xx_xx + and r11,r5,r6 @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_40_59(B,C,D) + add r7,r7,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + and r11,r4,r5 @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_40_59(B,C,D) + add r6,r6,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + and r11,r3,r4 @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_40_59(B,C,D) + add r5,r5,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + and r11,r7,r3 @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_40_59(B,C,D) + add r4,r4,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r5,r10,ror#2 @ F_xx_xx + and r11,r6,r7 @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_40_59(B,C,D) + add r3,r3,r11,ror#2 + cmp r14,sp + bne .L_40_59 @ [+((12+5)*5+2)*4] + + ldr r8,.LK_60_79 + sub sp,sp,#20*4 + cmp sp,#0 @ set carry to denote 60_79 + b .L_20_39_or_60_79 @ [+4], spare 300 bytes +.L_done: + add sp,sp,#80*4 @ "deallocate" stack frame + ldmia r0,{r8,r9,r10,r11,r12} + add r3,r8,r3 + add r4,r9,r4 + add r5,r10,r5,ror#2 + add r6,r11,r6,ror#2 + add r7,r12,r7,ror#2 + stmia r0,{r3,r4,r5,r6,r7} + teq r1,r2 + bne .Lloop @ [+18], total 1307 + + ldmia sp!,{r4-r12,pc} +.align 2 +.LK_00_19: .word 0x5a827999 +.LK_20_39: .word 0x6ed9eba1 +.LK_40_59: .word 0x8f1bbcdc +.LK_60_79: .word 0xca62c1d6 +ENDPROC(sha1_block_data_order) +.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by " +.align 2 diff --git a/lib/crypto/arm/sha1-armv7-neon.S b/lib/crypto/arm/sha1-armv7-neon.S new file mode 100644 index 000000000000..6edba3ab62e8 --- /dev/null +++ b/lib/crypto/arm/sha1-armv7-neon.S @@ -0,0 +1,633 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function + * + * Copyright © 2013-2014 Jussi Kivilinna + */ + +#include +#include + +.syntax unified +.fpu neon + +.text + + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 + + +/* Constants */ + +#define K1 0x5A827999 +#define K2 0x6ED9EBA1 +#define K3 0x8F1BBCDC +#define K4 0xCA62C1D6 +.align 4 +.LK_VEC: +.LK1: .long K1, K1, K1, K1 +.LK2: .long K2, K2, K2, K2 +.LK3: .long K3, K3, K3, K3 +.LK4: .long K4, K4, K4, K4 + + +/* Register macros */ + +#define RSTATE r0 +#define RDATA r1 +#define RNBLKS r2 +#define ROLDSTACK r3 +#define RWK lr + +#define _a r4 +#define _b r5 +#define _c r6 +#define _d r7 +#define _e r8 + +#define RT0 r9 +#define RT1 r10 +#define RT2 r11 +#define RT3 r12 + +#define W0 q0 +#define W1 q7 +#define W2 q2 +#define W3 q3 +#define W4 q4 +#define W5 q6 +#define W6 q5 +#define W7 q1 + +#define tmp0 q8 +#define tmp1 q9 +#define tmp2 q10 +#define tmp3 q11 + +#define qK1 q12 +#define qK2 q13 +#define qK3 q14 +#define qK4 q15 + +#ifdef CONFIG_CPU_BIG_ENDIAN +#define ARM_LE(code...) +#else +#define ARM_LE(code...) code +#endif + +/* Round function macros. 
*/ + +#define WK_offs(i) (((i) & 15) * 4) + +#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ldr RT3, [sp, WK_offs(i)]; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + bic RT0, d, b; \ + add e, e, a, ror #(32 - 5); \ + and RT1, c, b; \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add RT0, RT0, RT3; \ + add e, e, RT1; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT0; + +#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ldr RT3, [sp, WK_offs(i)]; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + eor RT0, d, b; \ + add e, e, a, ror #(32 - 5); \ + eor RT0, RT0, c; \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT3; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT0; \ + +#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ldr RT3, [sp, WK_offs(i)]; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + eor RT0, b, c; \ + and RT1, b, c; \ + add e, e, a, ror #(32 - 5); \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + and RT0, RT0, d; \ + add RT1, RT1, RT3; \ + add e, e, RT0; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT1; + +#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define R(a,b,c,d,e,f,i) \ + _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define dummy(...) + + +/* Input expansion macros. 
*/ + +/********* Precalc macros for rounds 0-15 *************************************/ + +#define W_PRECALC_00_15() \ + add RWK, sp, #(WK_offs(0)); \ + \ + vld1.32 {W0, W7}, [RDATA]!; \ + ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \ + vld1.32 {W6, W5}, [RDATA]!; \ + vadd.u32 tmp0, W0, curK; \ + ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \ + ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \ + vadd.u32 tmp1, W7, curK; \ + ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \ + vadd.u32 tmp2, W6, curK; \ + vst1.32 {tmp0, tmp1}, [RWK]!; \ + vadd.u32 tmp3, W5, curK; \ + vst1.32 {tmp2, tmp3}, [RWK]; \ + +#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vld1.32 {W0, W7}, [RDATA]!; \ + +#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + add RWK, sp, #(WK_offs(0)); \ + +#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \ + +#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vld1.32 {W6, W5}, [RDATA]!; \ + +#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp0, W0, curK; \ + +#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \ + +#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \ + +#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp1, W7, curK; \ + +#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \ + +#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp2, W6, curK; \ + +#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp0, tmp1}, [RWK]!; \ + +#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp3, W5, curK; \ + +#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp2, tmp3}, [RWK]; \ + + +/********* Precalc macros for rounds 16-31 ************************************/ + +#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp0, tmp0; \ + vext.8 W, W_m16, W_m12, #8; \ + +#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + add RWK, sp, #(WK_offs(i)); \ + vext.8 tmp0, W_m04, tmp0, #4; \ + +#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp0, tmp0, W_m16; \ + veor.32 W, W, W_m08; \ + +#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp1, tmp1; \ + veor W, W, tmp0; \ + +#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshl.u32 tmp0, W, #1; \ + +#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vext.8 tmp1, tmp1, W, #(16-12); \ + vshr.u32 W, W, #31; \ + +#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vorr tmp0, tmp0, W; \ + vshr.u32 W, tmp1, #30; \ + +#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshl.u32 tmp1, tmp1, #2; \ + +#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp0, tmp0, W; \ + +#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, tmp0, tmp1; \ + +#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp0, W, curK; \ + +#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + 
vst1.32 {tmp0}, [RWK]; + + +/********* Precalc macros for rounds 32-79 ************************************/ + +#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, W_m28; \ + +#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vext.8 tmp0, W_m08, W_m04, #8; \ + +#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, W_m16; \ + +#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, tmp0; \ + +#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + add RWK, sp, #(WK_offs(i&~3)); \ + +#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshl.u32 tmp1, W, #2; \ + +#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshr.u32 tmp0, W, #30; \ + +#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vorr W, tmp0, tmp1; \ + +#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp0, W, curK; \ + +#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp0}, [RWK]; + + +/* + * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA. + * + * void sha1_transform_neon(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +.align 3 +ENTRY(sha1_transform_neon) + /* input: + * r0: state + * r1: data (64*nblocks bytes) + * r2: nblocks + */ + + cmp RNBLKS, #0; + beq .Ldo_nothing; + + push {r4-r12, lr}; + /*vpush {q4-q7};*/ + + adr RT3, .LK_VEC; + + mov ROLDSTACK, sp; + + /* Align stack. */ + sub RT0, sp, #(16*4); + and RT0, #(~(16-1)); + mov sp, RT0; + + vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */ + + /* Get the values of the chaining variables. */ + ldm RSTATE, {_a-_e}; + + vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */ + +#undef curK +#define curK qK1 + /* Precalc 0-15. */ + W_PRECALC_00_15(); + +.Loop: + /* Transform 0-15 + Precalc 16-31. 
*/ + _R( _a, _b, _c, _d, _e, F1, 0, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16, + W4, W5, W6, W7, W0, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 1, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16, + W4, W5, W6, W7, W0, _, _, _ ); + _R( _d, _e, _a, _b, _c, F1, 2, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16, + W4, W5, W6, W7, W0, _, _, _ ); + _R( _c, _d, _e, _a, _b, F1, 3, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16, + W4, W5, W6, W7, W0, _, _, _ ); + +#undef curK +#define curK qK2 + _R( _b, _c, _d, _e, _a, F1, 4, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20, + W3, W4, W5, W6, W7, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 5, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20, + W3, W4, W5, W6, W7, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 6, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20, + W3, W4, W5, W6, W7, _, _, _ ); + _R( _d, _e, _a, _b, _c, F1, 7, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20, + W3, W4, W5, W6, W7, _, _, _ ); + + _R( _c, _d, _e, _a, _b, F1, 8, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24, + W2, W3, W4, W5, W6, _, _, _ ); + _R( _b, _c, _d, _e, _a, F1, 9, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24, + W2, W3, W4, W5, W6, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 10, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24, + W2, W3, W4, W5, W6, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 11, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24, + W2, W3, W4, W5, W6, _, _, _ ); + + _R( _d, _e, _a, _b, _c, F1, 12, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28, + W1, W2, W3, W4, W5, _, _, _ ); + _R( _c, _d, _e, _a, _b, F1, 13, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28, + W1, W2, W3, W4, W5, _, _, _ ); + _R( _b, _c, _d, _e, _a, F1, 14, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28, + W1, W2, W3, W4, W5, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 15, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28, + W1, W2, W3, W4, W5, _, _, _ ); + + /* Transform 16-63 + Precalc 32-79. 
*/ + _R( _e, _a, _b, _c, _d, F1, 16, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _d, _e, _a, _b, _c, F1, 17, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _c, _d, _e, _a, _b, F1, 18, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _b, _c, _d, _e, _a, F1, 19, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + + _R( _a, _b, _c, _d, _e, F2, 20, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _e, _a, _b, _c, _d, F2, 21, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _d, _e, _a, _b, _c, F2, 22, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _c, _d, _e, _a, _b, F2, 23, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + +#undef curK +#define curK qK3 + _R( _b, _c, _d, _e, _a, F2, 24, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _a, _b, _c, _d, _e, F2, 25, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _e, _a, _b, _c, _d, F2, 26, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _d, _e, _a, _b, _c, F2, 27, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + + _R( _c, _d, _e, _a, _b, F2, 28, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _b, _c, _d, _e, _a, F2, 29, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _a, _b, _c, _d, _e, F2, 30, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _e, _a, _b, _c, _d, F2, 31, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + + _R( _d, _e, _a, _b, _c, F2, 32, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + _R( _c, _d, _e, _a, _b, F2, 33, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + _R( _b, _c, _d, _e, _a, F2, 34, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + _R( _a, _b, _c, _d, _e, F2, 35, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + + _R( _e, _a, _b, _c, _d, F2, 36, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + _R( _d, _e, _a, _b, _c, F2, 37, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + _R( _c, _d, _e, _a, _b, F2, 38, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + _R( _b, _c, _d, _e, _a, F2, 39, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + + _R( _a, _b, _c, _d, _e, F3, 40, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + _R( _e, _a, _b, _c, _d, F3, 41, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + _R( _d, _e, _a, _b, _c, F3, 42, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + _R( _c, _d, _e, _a, _b, F3, 43, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + +#undef curK +#define curK qK4 + _R( _b, _c, _d, 
_e, _a, F3, 44, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + _R( _a, _b, _c, _d, _e, F3, 45, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + _R( _e, _a, _b, _c, _d, F3, 46, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + _R( _d, _e, _a, _b, _c, F3, 47, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + + _R( _c, _d, _e, _a, _b, F3, 48, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _b, _c, _d, _e, _a, F3, 49, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _a, _b, _c, _d, _e, F3, 50, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _e, _a, _b, _c, _d, F3, 51, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + + _R( _d, _e, _a, _b, _c, F3, 52, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _c, _d, _e, _a, _b, F3, 53, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _b, _c, _d, _e, _a, F3, 54, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _a, _b, _c, _d, _e, F3, 55, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + + _R( _e, _a, _b, _c, _d, F3, 56, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _d, _e, _a, _b, _c, F3, 57, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _c, _d, _e, _a, _b, F3, 58, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _b, _c, _d, _e, _a, F3, 59, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + + subs RNBLKS, #1; + + _R( _a, _b, _c, _d, _e, F4, 60, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _e, _a, _b, _c, _d, F4, 61, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _d, _e, _a, _b, _c, F4, 62, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _c, _d, _e, _a, _b, F4, 63, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + + beq .Lend; + + /* Transform 64-79 + Precalc 0-15 of next block. 
*/ +#undef curK +#define curK qK1 + _R( _b, _c, _d, _e, _a, F4, 64, + WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 65, + WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _e, _a, _b, _c, _d, F4, 66, + WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _d, _e, _a, _b, _c, F4, 67, + WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _c, _d, _e, _a, _b, F4, 68, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 69, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 70, + WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _e, _a, _b, _c, _d, F4, 71, + WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _d, _e, _a, _b, _c, F4, 72, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _c, _d, _e, _a, _b, F4, 73, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 74, + WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 75, + WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _e, _a, _b, _c, _d, F4, 76, + WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _d, _e, _a, _b, _c, F4, 77, + WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _c, _d, _e, _a, _b, F4, 78, + WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 79, + WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ ); + + /* Update the chaining variables. */ + ldm RSTATE, {RT0-RT3}; + add _a, RT0; + ldr RT0, [RSTATE, #state_h4]; + add _b, RT1; + add _c, RT2; + add _d, RT3; + add _e, RT0; + stm RSTATE, {_a-_e}; + + b .Loop; + +.Lend: + /* Transform 64-79 */ + R( _b, _c, _d, _e, _a, F4, 64 ); + R( _a, _b, _c, _d, _e, F4, 65 ); + R( _e, _a, _b, _c, _d, F4, 66 ); + R( _d, _e, _a, _b, _c, F4, 67 ); + R( _c, _d, _e, _a, _b, F4, 68 ); + R( _b, _c, _d, _e, _a, F4, 69 ); + R( _a, _b, _c, _d, _e, F4, 70 ); + R( _e, _a, _b, _c, _d, F4, 71 ); + R( _d, _e, _a, _b, _c, F4, 72 ); + R( _c, _d, _e, _a, _b, F4, 73 ); + R( _b, _c, _d, _e, _a, F4, 74 ); + R( _a, _b, _c, _d, _e, F4, 75 ); + R( _e, _a, _b, _c, _d, F4, 76 ); + R( _d, _e, _a, _b, _c, F4, 77 ); + R( _c, _d, _e, _a, _b, F4, 78 ); + R( _b, _c, _d, _e, _a, F4, 79 ); + + mov sp, ROLDSTACK; + + /* Update the chaining variables. */ + ldm RSTATE, {RT0-RT3}; + add _a, RT0; + ldr RT0, [RSTATE, #state_h4]; + add _b, RT1; + add _c, RT2; + add _d, RT3; + /*vpop {q4-q7};*/ + add _e, RT0; + stm RSTATE, {_a-_e}; + + pop {r4-r12, pc}; + +.Ldo_nothing: + bx lr +ENDPROC(sha1_transform_neon) diff --git a/lib/crypto/arm/sha1-ce-core.S b/lib/crypto/arm/sha1-ce-core.S new file mode 100644 index 000000000000..2de40dd25e47 --- /dev/null +++ b/lib/crypto/arm/sha1-ce-core.S @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd. 
+ * Author: Ard Biesheuvel + */ + +#include +#include + + .text + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + k0 .req q0 + k1 .req q1 + k2 .req q2 + k3 .req q3 + + ta0 .req q4 + ta1 .req q5 + tb0 .req q5 + tb1 .req q4 + + dga .req q6 + dgb .req q7 + dgbs .req s28 + + dg0 .req q12 + dg1a0 .req q13 + dg1a1 .req q14 + dg1b0 .req q14 + dg1b1 .req q13 + + .macro add_only, op, ev, rc, s0, dg1 + .ifnb \s0 + vadd.u32 tb\ev, q\s0, \rc + .endif + sha1h.32 dg1b\ev, dg0 + .ifb \dg1 + sha1\op\().32 dg0, dg1a\ev, ta\ev + .else + sha1\op\().32 dg0, \dg1, ta\ev + .endif + .endm + + .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 + sha1su0.32 q\s0, q\s1, q\s2 + add_only \op, \ev, \rc, \s1, \dg1 + sha1su1.32 q\s0, q\s3 + .endm + + .align 6 +.Lsha1_rcon: + .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 + .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 + .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc + .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 + + /* + * void sha1_ce_transform(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +ENTRY(sha1_ce_transform) + /* load round constants */ + adr ip, .Lsha1_rcon + vld1.32 {k0-k1}, [ip, :128]! + vld1.32 {k2-k3}, [ip, :128] + + /* load state */ + vld1.32 {dga}, [r0] + vldr dgbs, [r0, #16] + + /* load input */ +0: vld1.32 {q8-q9}, [r1]! + vld1.32 {q10-q11}, [r1]! + subs r2, r2, #1 + +#ifndef CONFIG_CPU_BIG_ENDIAN + vrev32.8 q8, q8 + vrev32.8 q9, q9 + vrev32.8 q10, q10 + vrev32.8 q11, q11 +#endif + + vadd.u32 ta0, q8, k0 + vmov dg0, dga + + add_update c, 0, k0, 8, 9, 10, 11, dgb + add_update c, 1, k0, 9, 10, 11, 8 + add_update c, 0, k0, 10, 11, 8, 9 + add_update c, 1, k0, 11, 8, 9, 10 + add_update c, 0, k1, 8, 9, 10, 11 + + add_update p, 1, k1, 9, 10, 11, 8 + add_update p, 0, k1, 10, 11, 8, 9 + add_update p, 1, k1, 11, 8, 9, 10 + add_update p, 0, k1, 8, 9, 10, 11 + add_update p, 1, k2, 9, 10, 11, 8 + + add_update m, 0, k2, 10, 11, 8, 9 + add_update m, 1, k2, 11, 8, 9, 10 + add_update m, 0, k2, 8, 9, 10, 11 + add_update m, 1, k2, 9, 10, 11, 8 + add_update m, 0, k3, 10, 11, 8, 9 + + add_update p, 1, k3, 11, 8, 9, 10 + add_only p, 0, k3, 9 + add_only p, 1, k3, 10 + add_only p, 0, k3, 11 + add_only p, 1 + + /* update state */ + vadd.u32 dga, dga, dg0 + vadd.u32 dgb, dgb, dg1a0 + bne 0b + + /* store new state */ + vst1.32 {dga}, [r0] + vstr dgbs, [r0, #16] + bx lr +ENDPROC(sha1_ce_transform) diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h new file mode 100644 index 000000000000..fa1e92419000 --- /dev/null +++ b/lib/crypto/arm/sha1.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for ARM + * + * Copyright 2025 Google LLC + */ +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +asmlinkage void sha1_block_data_order(struct sha1_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha1_transform_neon(struct sha1_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha1_ce_transform(struct sha1_block_state *state, + const u8 *data, size_t nblocks); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(may_use_simd())) { + kernel_neon_begin(); + if (static_branch_likely(&have_ce)) + sha1_ce_transform(state, data, nblocks); + else + sha1_transform_neon(state, data, nblocks); + kernel_neon_end(); + } else { + 
sha1_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + if (elf_hwcap2 & HWCAP2_SHA1) + static_branch_enable(&have_ce); + } +} +#endif /* CONFIG_KERNEL_MODE_NEON */ -- cgit v1.2.3 From 00d549bb89e471b7df550459fcb51ffbded39cbf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:22:59 -0700 Subject: lib/crypto: arm64/sha1: Migrate optimized code into library Instead of exposing the arm64-optimized SHA-1 code via arm64-specific crypto_shash algorithms, instead just implement the sha1_blocks() library function. This is much simpler, it makes the SHA-1 library functions be arm64-optimized, and it fixes the longstanding issue where the arm64-optimized SHA-1 code was disabled by default. SHA-1 still remains available through crypto_shash, but individual architectures no longer need to handle it. Remove support for SHA-1 finalization from assembly code, since the library does not yet support architecture-specific overrides of the finalization. (Support for that has been omitted for now, for simplicity and because usually it isn't performance-critical.) To match sha1_blocks(), change the type of the nblocks parameter and the return value of __sha1_ce_transform() from int to size_t. Update the assembly code accordingly. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-9-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/configs/defconfig | 1 - arch/arm64/crypto/Kconfig | 11 --- arch/arm64/crypto/Makefile | 3 - arch/arm64/crypto/sha1-ce-core.S | 150 --------------------------------------- arch/arm64/crypto/sha1-ce-glue.c | 118 ------------------------------ lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 1 + lib/crypto/arm64/sha1-ce-core.S | 130 +++++++++++++++++++++++++++++++++ lib/crypto/arm64/sha1.h | 39 ++++++++++ 9 files changed, 171 insertions(+), 283 deletions(-) delete mode 100644 arch/arm64/crypto/sha1-ce-core.S delete mode 100644 arch/arm64/crypto/sha1-ce-glue.c create mode 100644 lib/crypto/arm64/sha1-ce-core.S create mode 100644 lib/crypto/arm64/sha1.h diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index b612b78b3b09..31681206b49c 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1743,7 +1743,6 @@ CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_GHASH_ARM64_CE=y -CONFIG_CRYPTO_SHA1_ARM64_CE=y CONFIG_CRYPTO_SHA3_ARM64=m CONFIG_CRYPTO_SM3_ARM64_CE=m CONFIG_CRYPTO_AES_ARM64_CE_BLK=y diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index a9ead99f72c2..3bb5b513d5ae 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -25,17 +25,6 @@ config CRYPTO_NHPOLY1305_NEON Architecture: arm64 using: - NEON (Advanced SIMD) extensions -config CRYPTO_SHA1_ARM64_CE - tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_HASH - select CRYPTO_SHA1 - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: arm64 using: - - ARMv8 Crypto Extensions - config CRYPTO_SHA3_ARM64 tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)" depends on KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 228101f125d5..a8b2cdbe202c 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -5,9 +5,6 @@ # Copyright 
(C) 2014 Linaro Ltd # -obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o -sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o - obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S deleted file mode 100644 index 9b1f2d82a6fe..000000000000 --- a/arch/arm64/crypto/sha1-ce-core.S +++ /dev/null @@ -1,150 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions - * - * Copyright (C) 2014 Linaro Ltd - */ - -#include -#include - - .text - .arch armv8-a+crypto - - k0 .req v0 - k1 .req v1 - k2 .req v2 - k3 .req v3 - - t0 .req v4 - t1 .req v5 - - dga .req q6 - dgav .req v6 - dgb .req s7 - dgbv .req v7 - - dg0q .req q12 - dg0s .req s12 - dg0v .req v12 - dg1s .req s13 - dg1v .req v13 - dg2s .req s14 - - .macro add_only, op, ev, rc, s0, dg1 - .ifc \ev, ev - add t1.4s, v\s0\().4s, \rc\().4s - sha1h dg2s, dg0s - .ifnb \dg1 - sha1\op dg0q, \dg1, t0.4s - .else - sha1\op dg0q, dg1s, t0.4s - .endif - .else - .ifnb \s0 - add t0.4s, v\s0\().4s, \rc\().4s - .endif - sha1h dg1s, dg0s - sha1\op dg0q, dg2s, t1.4s - .endif - .endm - - .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 - sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s - add_only \op, \ev, \rc, \s1, \dg1 - sha1su1 v\s0\().4s, v\s3\().4s - .endm - - .macro loadrc, k, val, tmp - movz \tmp, :abs_g0_nc:\val - movk \tmp, :abs_g1:\val - dup \k, \tmp - .endm - - /* - * int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, - * int blocks) - */ -SYM_FUNC_START(__sha1_ce_transform) - /* load round constants */ - loadrc k0.4s, 0x5a827999, w6 - loadrc k1.4s, 0x6ed9eba1, w6 - loadrc k2.4s, 0x8f1bbcdc, w6 - loadrc k3.4s, 0xca62c1d6, w6 - - /* load state */ - ld1 {dgav.4s}, [x0] - ldr dgb, [x0, #16] - - /* load sha1_ce_state::finalize */ - ldr_l w4, sha1_ce_offsetof_finalize, x4 - ldr w4, [x0, x4] - - /* load input */ -0: ld1 {v8.4s-v11.4s}, [x1], #64 - sub w2, w2, #1 - -CPU_LE( rev32 v8.16b, v8.16b ) -CPU_LE( rev32 v9.16b, v9.16b ) -CPU_LE( rev32 v10.16b, v10.16b ) -CPU_LE( rev32 v11.16b, v11.16b ) - -1: add t0.4s, v8.4s, k0.4s - mov dg0v.16b, dgav.16b - - add_update c, ev, k0, 8, 9, 10, 11, dgb - add_update c, od, k0, 9, 10, 11, 8 - add_update c, ev, k0, 10, 11, 8, 9 - add_update c, od, k0, 11, 8, 9, 10 - add_update c, ev, k1, 8, 9, 10, 11 - - add_update p, od, k1, 9, 10, 11, 8 - add_update p, ev, k1, 10, 11, 8, 9 - add_update p, od, k1, 11, 8, 9, 10 - add_update p, ev, k1, 8, 9, 10, 11 - add_update p, od, k2, 9, 10, 11, 8 - - add_update m, ev, k2, 10, 11, 8, 9 - add_update m, od, k2, 11, 8, 9, 10 - add_update m, ev, k2, 8, 9, 10, 11 - add_update m, od, k2, 9, 10, 11, 8 - add_update m, ev, k3, 10, 11, 8, 9 - - add_update p, od, k3, 11, 8, 9, 10 - add_only p, ev, k3, 9 - add_only p, od, k3, 10 - add_only p, ev, k3, 11 - add_only p, od - - /* update state */ - add dgbv.2s, dgbv.2s, dg1v.2s - add dgav.4s, dgav.4s, dg0v.4s - - cbz w2, 2f - cond_yield 3f, x5, x6 - b 0b - - /* - * Final block: add padding and total bit count. - * Skip if the input size was not a round multiple of the block size, - * the padding is handled by the C code in that case. 
- */ -2: cbz x4, 3f - ldr_l w4, sha1_ce_offsetof_count, x4 - ldr x4, [x0, x4] - movi v9.2d, #0 - mov x8, #0x80000000 - movi v10.2d, #0 - ror x7, x4, #29 // ror(lsl(x4, 3), 32) - fmov d8, x8 - mov x4, #0 - mov v11.d[0], xzr - mov v11.d[1], x7 - b 1b - - /* store new state */ -3: st1 {dgav.4s}, [x0] - str dgb, [x0, #16] - mov w0, w2 - ret -SYM_FUNC_END(__sha1_ce_transform) diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c deleted file mode 100644 index 65b6980817e5..000000000000 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ /dev/null @@ -1,118 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions - * - * Copyright (C) 2014 - 2017 Linaro Ltd - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha1"); - -struct sha1_ce_state { - struct sha1_state sst; - u32 finalize; -}; - -extern const u32 sha1_ce_offsetof_count; -extern const u32 sha1_ce_offsetof_finalize; - -asmlinkage int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, - int blocks); - -static void sha1_ce_transform(struct sha1_state *sst, u8 const *src, - int blocks) -{ - while (blocks) { - int rem; - - kernel_neon_begin(); - rem = __sha1_ce_transform(container_of(sst, - struct sha1_ce_state, - sst), src, blocks); - kernel_neon_end(); - src += (blocks - rem) * SHA1_BLOCK_SIZE; - blocks = rem; - } -} - -const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count); -const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize); - -static int sha1_ce_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha1_ce_state *sctx = shash_desc_ctx(desc); - - sctx->finalize = 0; - return sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform); -} - -static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct sha1_ce_state *sctx = shash_desc_ctx(desc); - bool finalized = false; - - /* - * Allow the asm code to perform the finalization if there is no - * partial data and the input is a round multiple of the block size. 
- */ - if (len >= SHA1_BLOCK_SIZE) { - unsigned int remain = len - round_down(len, SHA1_BLOCK_SIZE); - - finalized = !remain; - sctx->finalize = finalized; - sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform); - data += len - remain; - len = remain; - } - if (!finalized) { - sctx->finalize = 0; - sha1_base_do_finup(desc, data, len, sha1_ce_transform); - } - return sha1_base_finish(desc, out); -} - -static struct shash_alg alg = { - .init = sha1_base_init, - .update = sha1_ce_update, - .finup = sha1_ce_finup, - .descsize = sizeof(struct sha1_ce_state), - .statesize = SHA1_STATE_SIZE, - .digestsize = SHA1_DIGEST_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ce", - .cra_priority = 200, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha1_ce_mod_init(void) -{ - return crypto_register_shash(&alg); -} - -static void __exit sha1_ce_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_cpu_feature_match(SHA1, sha1_ce_mod_init); -module_exit(sha1_ce_mod_fini); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 519c5d6a050f..05cce143af31 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -147,6 +147,7 @@ config CRYPTO_LIB_SHA1_ARCH bool depends on CRYPTO_LIB_SHA1 && !UML default y if ARM + default y if ARM64 && KERNEL_MODE_NEON config CRYPTO_LIB_SHA256 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 699a42133927..1da13c9e2f71 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -76,6 +76,7 @@ libsha1-y += arm/sha1-armv4-large.o libsha1-$(CONFIG_KERNEL_MODE_NEON) += arm/sha1-armv7-neon.o \ arm/sha1-ce-core.o endif +libsha1-$(CONFIG_ARM64) += arm64/sha1-ce-core.o endif # CONFIG_CRYPTO_LIB_SHA1_ARCH ################################################################################ diff --git a/lib/crypto/arm64/sha1-ce-core.S b/lib/crypto/arm64/sha1-ce-core.S new file mode 100644 index 000000000000..21efbbafd7d6 --- /dev/null +++ b/lib/crypto/arm64/sha1-ce-core.S @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd + */ + +#include +#include + + .text + .arch armv8-a+crypto + + k0 .req v0 + k1 .req v1 + k2 .req v2 + k3 .req v3 + + t0 .req v4 + t1 .req v5 + + dga .req q6 + dgav .req v6 + dgb .req s7 + dgbv .req v7 + + dg0q .req q12 + dg0s .req s12 + dg0v .req v12 + dg1s .req s13 + dg1v .req v13 + dg2s .req s14 + + .macro add_only, op, ev, rc, s0, dg1 + .ifc \ev, ev + add t1.4s, v\s0\().4s, \rc\().4s + sha1h dg2s, dg0s + .ifnb \dg1 + sha1\op dg0q, \dg1, t0.4s + .else + sha1\op dg0q, dg1s, t0.4s + .endif + .else + .ifnb \s0 + add t0.4s, v\s0\().4s, \rc\().4s + .endif + sha1h dg1s, dg0s + sha1\op dg0q, dg2s, t1.4s + .endif + .endm + + .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 + sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s + add_only \op, \ev, \rc, \s1, \dg1 + sha1su1 v\s0\().4s, v\s3\().4s + .endm + + .macro loadrc, k, val, tmp + movz \tmp, :abs_g0_nc:\val + movk \tmp, :abs_g1:\val + dup \k, \tmp + .endm + + /* + * size_t __sha1_ce_transform(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +SYM_FUNC_START(__sha1_ce_transform) + /* load round constants */ + loadrc k0.4s, 0x5a827999, w6 + loadrc k1.4s, 0x6ed9eba1, w6 + loadrc k2.4s, 0x8f1bbcdc, w6 + loadrc k3.4s, 0xca62c1d6, w6 + + /* load state */ + ld1 {dgav.4s}, [x0] + ldr dgb, [x0, #16] + + /* load 
input */ +0: ld1 {v8.4s-v11.4s}, [x1], #64 + sub x2, x2, #1 + +CPU_LE( rev32 v8.16b, v8.16b ) +CPU_LE( rev32 v9.16b, v9.16b ) +CPU_LE( rev32 v10.16b, v10.16b ) +CPU_LE( rev32 v11.16b, v11.16b ) + + add t0.4s, v8.4s, k0.4s + mov dg0v.16b, dgav.16b + + add_update c, ev, k0, 8, 9, 10, 11, dgb + add_update c, od, k0, 9, 10, 11, 8 + add_update c, ev, k0, 10, 11, 8, 9 + add_update c, od, k0, 11, 8, 9, 10 + add_update c, ev, k1, 8, 9, 10, 11 + + add_update p, od, k1, 9, 10, 11, 8 + add_update p, ev, k1, 10, 11, 8, 9 + add_update p, od, k1, 11, 8, 9, 10 + add_update p, ev, k1, 8, 9, 10, 11 + add_update p, od, k2, 9, 10, 11, 8 + + add_update m, ev, k2, 10, 11, 8, 9 + add_update m, od, k2, 11, 8, 9, 10 + add_update m, ev, k2, 8, 9, 10, 11 + add_update m, od, k2, 9, 10, 11, 8 + add_update m, ev, k3, 10, 11, 8, 9 + + add_update p, od, k3, 11, 8, 9, 10 + add_only p, ev, k3, 9 + add_only p, od, k3, 10 + add_only p, ev, k3, 11 + add_only p, od + + /* update state */ + add dgbv.2s, dgbv.2s, dg1v.2s + add dgav.4s, dgav.4s, dg0v.4s + + /* return early if voluntary preemption is needed */ + cond_yield 1f, x5, x6 + + /* handled all input blocks? */ + cbnz x2, 0b + + /* store new state */ +1: st1 {dgav.4s}, [x0] + str dgb, [x0, #16] + mov x0, x2 + ret +SYM_FUNC_END(__sha1_ce_transform) diff --git a/lib/crypto/arm64/sha1.h b/lib/crypto/arm64/sha1.h new file mode 100644 index 000000000000..f822563538cc --- /dev/null +++ b/lib/crypto/arm64/sha1.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for ARM64 + * + * Copyright 2025 Google LLC + */ +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +asmlinkage size_t __sha1_ce_transform(struct sha1_block_state *state, + const u8 *data, size_t nblocks); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_ce) && likely(may_use_simd())) { + do { + size_t rem; + + kernel_neon_begin(); + rem = __sha1_ce_transform(state, data, nblocks); + kernel_neon_end(); + data += (nblocks - rem) * SHA1_BLOCK_SIZE; + nblocks = rem; + } while (nblocks); + } else { + sha1_blocks_generic(state, data, nblocks); + } +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (cpu_have_named_feature(SHA1)) + static_branch_enable(&have_ce); +} -- cgit v1.2.3 From b6ac1dac2f18d1f9714804654ad0643d5aeef4d5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:23:00 -0700 Subject: lib/crypto: mips/sha1: Migrate optimized code into library Instead of exposing the mips-optimized SHA-1 code via mips-specific crypto_shash algorithms, instead just implement the sha1_blocks() library function. This is much simpler, it makes the SHA-1 library functions be mips-optimized, and it fixes the longstanding issue where the mips-optimized SHA-1 code was disabled by default. SHA-1 still remains available through crypto_shash, but individual architectures no longer need to handle it. Note: to see the diff from arch/mips/cavium-octeon/crypto/octeon-sha1.c to lib/crypto/mips/sha1.h, view this commit with 'git show -M10'. 
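[Editorial note, not part of the original patches: every arch conversion in this series follows the same shape, a small lib/crypto/<arch>/sha1.h that overrides sha1_blocks() and optionally defines sha1_mod_init_arch(). The sketch below is illustrative only and is not taken from any one patch: the routine myarch_sha1_transform() and the feature test myarch_cpu_has_sha1() are placeholders, and the header choices are assumptions; only sha1_blocks(), sha1_blocks_generic(), struct sha1_block_state and the sha1_mod_init_arch hook come from the patches themselves.]

/* Illustrative sketch of a lib/crypto/<arch>/sha1.h hook (not a real arch). */
#include <linux/jump_label.h>		/* assumed: static keys */
#include <linux/linkage.h>		/* assumed: asmlinkage */
#include <linux/types.h>

/* Flipped once at init if the CPU has the needed instructions. */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha1_insns);

/* Placeholder name for the arch's assembly block routine. */
asmlinkage void myarch_sha1_transform(struct sha1_block_state *state,
				      const u8 *data, size_t nblocks);

static void sha1_blocks(struct sha1_block_state *state,
			const u8 *data, size_t nblocks)
{
	if (static_branch_likely(&have_sha1_insns))
		myarch_sha1_transform(state, data, nblocks);
	else
		sha1_blocks_generic(state, data, nblocks);
}

#define sha1_mod_init_arch sha1_mod_init_arch
static inline void sha1_mod_init_arch(void)
{
	if (myarch_cpu_has_sha1())	/* placeholder CPU-feature check */
		static_branch_enable(&have_sha1_insns);
}

[The real headers vary only in the details: the arm and arm64 versions above additionally wrap the call in kernel_neon_begin()/kernel_neon_end() and fall back when may_use_simd() is false, while the OCTEON version checks octeon_has_crypto() at call time instead of using a static key.]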
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-10-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/mips/cavium-octeon/crypto/Makefile | 1 - arch/mips/cavium-octeon/crypto/octeon-sha1.c | 146 --------------------------- arch/mips/configs/cavium_octeon_defconfig | 1 - arch/mips/crypto/Kconfig | 10 -- lib/crypto/Kconfig | 1 + lib/crypto/mips/sha1.h | 81 +++++++++++++++ 6 files changed, 82 insertions(+), 158 deletions(-) delete mode 100644 arch/mips/cavium-octeon/crypto/octeon-sha1.c create mode 100644 lib/crypto/mips/sha1.h diff --git a/arch/mips/cavium-octeon/crypto/Makefile b/arch/mips/cavium-octeon/crypto/Makefile index db428e4b30bc..83f2f5dd93cc 100644 --- a/arch/mips/cavium-octeon/crypto/Makefile +++ b/arch/mips/cavium-octeon/crypto/Makefile @@ -6,4 +6,3 @@ obj-y += octeon-crypto.o obj-$(CONFIG_CRYPTO_MD5_OCTEON) += octeon-md5.o -obj-$(CONFIG_CRYPTO_SHA1_OCTEON) += octeon-sha1.o diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha1.c b/arch/mips/cavium-octeon/crypto/octeon-sha1.c deleted file mode 100644 index e4a369a7764f..000000000000 --- a/arch/mips/cavium-octeon/crypto/octeon-sha1.c +++ /dev/null @@ -1,146 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * - * SHA1 Secure Hash Algorithm. - * - * Adapted for OCTEON by Aaro Koskinen . - * - * Based on crypto/sha1_generic.c, which is: - * - * Copyright (c) Alan Smithee. - * Copyright (c) Andrew McDonald - * Copyright (c) Jean-Francois Dive - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * We pass everything as 64-bit. OCTEON can handle misaligned data. - */ - -static void octeon_sha1_store_hash(struct sha1_state *sctx) -{ - u64 *hash = (u64 *)sctx->state; - union { - u32 word[2]; - u64 dword; - } hash_tail = { { sctx->state[4], } }; - - write_octeon_64bit_hash_dword(hash[0], 0); - write_octeon_64bit_hash_dword(hash[1], 1); - write_octeon_64bit_hash_dword(hash_tail.dword, 2); - memzero_explicit(&hash_tail.word[0], sizeof(hash_tail.word[0])); -} - -static void octeon_sha1_read_hash(struct sha1_state *sctx) -{ - u64 *hash = (u64 *)sctx->state; - union { - u32 word[2]; - u64 dword; - } hash_tail; - - hash[0] = read_octeon_64bit_hash_dword(0); - hash[1] = read_octeon_64bit_hash_dword(1); - hash_tail.dword = read_octeon_64bit_hash_dword(2); - sctx->state[4] = hash_tail.word[0]; - memzero_explicit(&hash_tail.dword, sizeof(hash_tail.dword)); -} - -static void octeon_sha1_transform(struct sha1_state *sctx, const u8 *src, - int blocks) -{ - do { - const u64 *block = (const u64 *)src; - - write_octeon_64bit_block_dword(block[0], 0); - write_octeon_64bit_block_dword(block[1], 1); - write_octeon_64bit_block_dword(block[2], 2); - write_octeon_64bit_block_dword(block[3], 3); - write_octeon_64bit_block_dword(block[4], 4); - write_octeon_64bit_block_dword(block[5], 5); - write_octeon_64bit_block_dword(block[6], 6); - octeon_sha1_start(block[7]); - - src += SHA1_BLOCK_SIZE; - } while (--blocks); -} - -static int octeon_sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - struct octeon_cop2_state state; - unsigned long flags; - int remain; - - flags = octeon_crypto_enable(&state); - octeon_sha1_store_hash(sctx); - - remain = sha1_base_do_update_blocks(desc, data, len, - octeon_sha1_transform); - - octeon_sha1_read_hash(sctx); - octeon_crypto_disable(&state, flags); - return remain; -} - -static int octeon_sha1_finup(struct shash_desc *desc, const u8 *src, - 
unsigned int len, u8 *out) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - struct octeon_cop2_state state; - unsigned long flags; - - flags = octeon_crypto_enable(&state); - octeon_sha1_store_hash(sctx); - - sha1_base_do_finup(desc, src, len, octeon_sha1_transform); - - octeon_sha1_read_hash(sctx); - octeon_crypto_disable(&state, flags); - return sha1_base_finish(desc, out); -} - -static struct shash_alg octeon_sha1_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = octeon_sha1_update, - .finup = octeon_sha1_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name= "octeon-sha1", - .cra_priority = OCTEON_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init octeon_sha1_mod_init(void) -{ - if (!octeon_has_crypto()) - return -ENOTSUPP; - return crypto_register_shash(&octeon_sha1_alg); -} - -static void __exit octeon_sha1_mod_fini(void) -{ - crypto_unregister_shash(&octeon_sha1_alg); -} - -module_init(octeon_sha1_mod_init); -module_exit(octeon_sha1_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm (OCTEON)"); -MODULE_AUTHOR("Aaro Koskinen "); diff --git a/arch/mips/configs/cavium_octeon_defconfig b/arch/mips/configs/cavium_octeon_defconfig index effdfb2bb738..3f50e1d78894 100644 --- a/arch/mips/configs/cavium_octeon_defconfig +++ b/arch/mips/configs/cavium_octeon_defconfig @@ -156,7 +156,6 @@ CONFIG_SECURITY_NETWORK=y CONFIG_CRYPTO_CBC=y CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_MD5_OCTEON=y -CONFIG_CRYPTO_SHA1_OCTEON=m CONFIG_CRYPTO_DES=y CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/crypto/Kconfig b/arch/mips/crypto/Kconfig index 51a76a5ee3b1..7b91f4ec65bf 100644 --- a/arch/mips/crypto/Kconfig +++ b/arch/mips/crypto/Kconfig @@ -12,14 +12,4 @@ config CRYPTO_MD5_OCTEON Architecture: mips OCTEON using crypto instructions, when available -config CRYPTO_SHA1_OCTEON - tristate "Hash functions: SHA-1 (OCTEON)" - depends on CPU_CAVIUM_OCTEON - select CRYPTO_SHA1 - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: mips OCTEON - endmenu diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 05cce143af31..7c9dc432fb42 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -148,6 +148,7 @@ config CRYPTO_LIB_SHA1_ARCH depends on CRYPTO_LIB_SHA1 && !UML default y if ARM default y if ARM64 && KERNEL_MODE_NEON + default y if MIPS && CPU_CAVIUM_OCTEON config CRYPTO_LIB_SHA256 tristate diff --git a/lib/crypto/mips/sha1.h b/lib/crypto/mips/sha1.h new file mode 100644 index 000000000000..ba1965002e4a --- /dev/null +++ b/lib/crypto/mips/sha1.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Cryptographic API. + * + * SHA1 Secure Hash Algorithm. + * + * Adapted for OCTEON by Aaro Koskinen . + * + * Based on crypto/sha1_generic.c, which is: + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald + * Copyright (c) Jean-Francois Dive + */ + +#include +#include + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. 
+ */ + +static void octeon_sha1_store_hash(struct sha1_block_state *state) +{ + u64 *hash = (u64 *)&state->h[0]; + union { + u32 word[2]; + u64 dword; + } hash_tail = { { state->h[4], } }; + + write_octeon_64bit_hash_dword(hash[0], 0); + write_octeon_64bit_hash_dword(hash[1], 1); + write_octeon_64bit_hash_dword(hash_tail.dword, 2); + memzero_explicit(&hash_tail.word[0], sizeof(hash_tail.word[0])); +} + +static void octeon_sha1_read_hash(struct sha1_block_state *state) +{ + u64 *hash = (u64 *)&state->h[0]; + union { + u32 word[2]; + u64 dword; + } hash_tail; + + hash[0] = read_octeon_64bit_hash_dword(0); + hash[1] = read_octeon_64bit_hash_dword(1); + hash_tail.dword = read_octeon_64bit_hash_dword(2); + state->h[4] = hash_tail.word[0]; + memzero_explicit(&hash_tail.dword, sizeof(hash_tail.dword)); +} + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + struct octeon_cop2_state cop2_state; + unsigned long flags; + + if (!octeon_has_crypto()) + return sha1_blocks_generic(state, data, nblocks); + + flags = octeon_crypto_enable(&cop2_state); + octeon_sha1_store_hash(state); + + do { + const u64 *block = (const u64 *)data; + + write_octeon_64bit_block_dword(block[0], 0); + write_octeon_64bit_block_dword(block[1], 1); + write_octeon_64bit_block_dword(block[2], 2); + write_octeon_64bit_block_dword(block[3], 3); + write_octeon_64bit_block_dword(block[4], 4); + write_octeon_64bit_block_dword(block[5], 5); + write_octeon_64bit_block_dword(block[6], 6); + octeon_sha1_start(block[7]); + + data += SHA1_BLOCK_SIZE; + } while (--nblocks); + + octeon_sha1_read_hash(state); + octeon_crypto_disable(&cop2_state, flags); +} -- cgit v1.2.3 From 6b9ae8cfaa7abc65f9fc8bd93f0c707c31b7ce85 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:23:01 -0700 Subject: lib/crypto: powerpc/sha1: Migrate optimized code into library Instead of exposing the powerpc-optimized SHA-1 code via powerpc-specific crypto_shash algorithms, instead just implement the sha1_blocks() library function. This is much simpler, it makes the SHA-1 library functions be powerpc-optimized, and it fixes the longstanding issue where the powerpc-optimized SHA-1 code was disabled by default. SHA-1 still remains available through crypto_shash, but individual architectures no longer need to handle it. Note: to see the diff from arch/powerpc/crypto/sha1-spe-glue.c to lib/crypto/powerpc/sha1.h, view this commit with 'git show -M10'. 
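[Editorial note, not part of the original patches: the payoff of routing everything through sha1_blocks() is that callers of the SHA-1 library need no arch awareness at all. A minimal usage sketch follows, assuming the library exposes a sha256()-style one-shot helper named sha1() declared in <crypto/sha1.h>; that prototype is an assumption, not quoted from these patches.]

#include <crypto/sha1.h>	/* SHA1_DIGEST_SIZE; one-shot sha1() assumed here */

static void example_sha1_digest(const u8 *buf, size_t len)
{
	u8 digest[SHA1_DIGEST_SIZE];

	/*
	 * One-shot hash: the library walks the input in SHA1_BLOCK_SIZE
	 * chunks via sha1_blocks(), so the per-arch hooks added in this
	 * series are used automatically when the CPU supports them.
	 */
	sha1(buf, len, digest);
}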
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-11-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/powerpc/configs/44x/akebono_defconfig | 1 - arch/powerpc/configs/powernv_defconfig | 1 - arch/powerpc/configs/ppc64_defconfig | 1 - arch/powerpc/crypto/Kconfig | 16 -- arch/powerpc/crypto/Makefile | 4 - arch/powerpc/crypto/sha1-powerpc-asm.S | 188 ------------------ arch/powerpc/crypto/sha1-spe-asm.S | 294 ----------------------------- arch/powerpc/crypto/sha1-spe-glue.c | 107 ----------- arch/powerpc/crypto/sha1.c | 78 -------- lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 4 + lib/crypto/powerpc/sha1-powerpc-asm.S | 188 ++++++++++++++++++ lib/crypto/powerpc/sha1-spe-asm.S | 294 +++++++++++++++++++++++++++++ lib/crypto/powerpc/sha1.h | 67 +++++++ 14 files changed, 554 insertions(+), 690 deletions(-) delete mode 100644 arch/powerpc/crypto/sha1-powerpc-asm.S delete mode 100644 arch/powerpc/crypto/sha1-spe-asm.S delete mode 100644 arch/powerpc/crypto/sha1-spe-glue.c delete mode 100644 arch/powerpc/crypto/sha1.c create mode 100644 lib/crypto/powerpc/sha1-powerpc-asm.S create mode 100644 lib/crypto/powerpc/sha1-spe-asm.S create mode 100644 lib/crypto/powerpc/sha1.h diff --git a/arch/powerpc/configs/44x/akebono_defconfig b/arch/powerpc/configs/44x/akebono_defconfig index fde4824f235e..1882eb2da354 100644 --- a/arch/powerpc/configs/44x/akebono_defconfig +++ b/arch/powerpc/configs/44x/akebono_defconfig @@ -128,6 +128,5 @@ CONFIG_PPC_EARLY_DEBUG_44x_PHYSLOW=0x00010000 CONFIG_PPC_EARLY_DEBUG_44x_PHYSHIGH=0x33f CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_SHA1_PPC=y CONFIG_CRYPTO_DES=y # CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index 379229c982a4..98f56e63ad21 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -322,7 +322,6 @@ CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_SHA1_PPC=m CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_ANUBIS=m diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 3423c405cad4..dca67aae5da3 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -388,7 +388,6 @@ CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_LZO=m CONFIG_CRYPTO_MD5_PPC=m -CONFIG_CRYPTO_SHA1_PPC=m CONFIG_CRYPTO_AES_GCM_P10=m CONFIG_CRYPTO_DEV_NX=y CONFIG_CRYPTO_DEV_NX_ENCRYPT=m diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index caaa359f4742..cfe39fc221cf 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -23,22 +23,6 @@ config CRYPTO_MD5_PPC Architecture: powerpc -config CRYPTO_SHA1_PPC - tristate "Hash functions: SHA-1" - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: powerpc - -config CRYPTO_SHA1_PPC_SPE - tristate "Hash functions: SHA-1 (SPE)" - depends on SPE - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: powerpc using - - SPE (Signal Processing Engine) extensions - config CRYPTO_AES_PPC_SPE tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)" depends on SPE diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index 8c2936ae466f..bc8fd27344b8 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -7,16 +7,12 @@ obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o 
-obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o -obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o md5-ppc-y := md5-asm.o md5-glue.o -sha1-powerpc-y := sha1-powerpc-asm.o sha1.o -sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o diff --git a/arch/powerpc/crypto/sha1-powerpc-asm.S b/arch/powerpc/crypto/sha1-powerpc-asm.S deleted file mode 100644 index f0d5ed557ab1..000000000000 --- a/arch/powerpc/crypto/sha1-powerpc-asm.S +++ /dev/null @@ -1,188 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * SHA-1 implementation for PowerPC. - * - * Copyright (C) 2005 Paul Mackerras - */ - -#include -#include -#include - -#ifdef __BIG_ENDIAN__ -#define LWZ(rt, d, ra) \ - lwz rt,d(ra) -#else -#define LWZ(rt, d, ra) \ - li rt,d; \ - lwbrx rt,rt,ra -#endif - -/* - * We roll the registers for T, A, B, C, D, E around on each - * iteration; T on iteration t is A on iteration t+1, and so on. - * We use registers 7 - 12 for this. - */ -#define RT(t) ((((t)+5)%6)+7) -#define RA(t) ((((t)+4)%6)+7) -#define RB(t) ((((t)+3)%6)+7) -#define RC(t) ((((t)+2)%6)+7) -#define RD(t) ((((t)+1)%6)+7) -#define RE(t) ((((t)+0)%6)+7) - -/* We use registers 16 - 31 for the W values */ -#define W(t) (((t)%16)+16) - -#define LOADW(t) \ - LWZ(W(t),(t)*4,r4) - -#define STEPD0_LOAD(t) \ - andc r0,RD(t),RB(t); \ - and r6,RB(t),RC(t); \ - rotlwi RT(t),RA(t),5; \ - or r6,r6,r0; \ - add r0,RE(t),r15; \ - add RT(t),RT(t),r6; \ - add r14,r0,W(t); \ - LWZ(W((t)+4),((t)+4)*4,r4); \ - rotlwi RB(t),RB(t),30; \ - add RT(t),RT(t),r14 - -#define STEPD0_UPDATE(t) \ - and r6,RB(t),RC(t); \ - andc r0,RD(t),RB(t); \ - rotlwi RT(t),RA(t),5; \ - rotlwi RB(t),RB(t),30; \ - or r6,r6,r0; \ - add r0,RE(t),r15; \ - xor r5,W((t)+4-3),W((t)+4-8); \ - add RT(t),RT(t),r6; \ - xor W((t)+4),W((t)+4-16),W((t)+4-14); \ - add r0,r0,W(t); \ - xor W((t)+4),W((t)+4),r5; \ - add RT(t),RT(t),r0; \ - rotlwi W((t)+4),W((t)+4),1 - -#define STEPD1(t) \ - xor r6,RB(t),RC(t); \ - rotlwi RT(t),RA(t),5; \ - rotlwi RB(t),RB(t),30; \ - xor r6,r6,RD(t); \ - add r0,RE(t),r15; \ - add RT(t),RT(t),r6; \ - add r0,r0,W(t); \ - add RT(t),RT(t),r0 - -#define STEPD1_UPDATE(t) \ - xor r6,RB(t),RC(t); \ - rotlwi RT(t),RA(t),5; \ - rotlwi RB(t),RB(t),30; \ - xor r6,r6,RD(t); \ - add r0,RE(t),r15; \ - xor r5,W((t)+4-3),W((t)+4-8); \ - add RT(t),RT(t),r6; \ - xor W((t)+4),W((t)+4-16),W((t)+4-14); \ - add r0,r0,W(t); \ - xor W((t)+4),W((t)+4),r5; \ - add RT(t),RT(t),r0; \ - rotlwi W((t)+4),W((t)+4),1 - -#define STEPD2_UPDATE(t) \ - and r6,RB(t),RC(t); \ - and r0,RB(t),RD(t); \ - rotlwi RT(t),RA(t),5; \ - or r6,r6,r0; \ - rotlwi RB(t),RB(t),30; \ - and r0,RC(t),RD(t); \ - xor r5,W((t)+4-3),W((t)+4-8); \ - or r6,r6,r0; \ - xor W((t)+4),W((t)+4-16),W((t)+4-14); \ - add r0,RE(t),r15; \ - add RT(t),RT(t),r6; \ - add r0,r0,W(t); \ - xor W((t)+4),W((t)+4),r5; \ - add RT(t),RT(t),r0; \ - rotlwi W((t)+4),W((t)+4),1 - -#define STEP0LD4(t) \ - STEPD0_LOAD(t); \ - STEPD0_LOAD((t)+1); \ - STEPD0_LOAD((t)+2); \ - STEPD0_LOAD((t)+3) - -#define STEPUP4(t, fn) \ - STEP##fn##_UPDATE(t); \ - 
STEP##fn##_UPDATE((t)+1); \ - STEP##fn##_UPDATE((t)+2); \ - STEP##fn##_UPDATE((t)+3) - -#define STEPUP20(t, fn) \ - STEPUP4(t, fn); \ - STEPUP4((t)+4, fn); \ - STEPUP4((t)+8, fn); \ - STEPUP4((t)+12, fn); \ - STEPUP4((t)+16, fn) - -_GLOBAL(powerpc_sha_transform) - PPC_STLU r1,-INT_FRAME_SIZE(r1) - SAVE_GPRS(14, 31, r1) - - /* Load up A - E */ - lwz RA(0),0(r3) /* A */ - lwz RB(0),4(r3) /* B */ - lwz RC(0),8(r3) /* C */ - lwz RD(0),12(r3) /* D */ - lwz RE(0),16(r3) /* E */ - - LOADW(0) - LOADW(1) - LOADW(2) - LOADW(3) - - lis r15,0x5a82 /* K0-19 */ - ori r15,r15,0x7999 - STEP0LD4(0) - STEP0LD4(4) - STEP0LD4(8) - STEPUP4(12, D0) - STEPUP4(16, D0) - - lis r15,0x6ed9 /* K20-39 */ - ori r15,r15,0xeba1 - STEPUP20(20, D1) - - lis r15,0x8f1b /* K40-59 */ - ori r15,r15,0xbcdc - STEPUP20(40, D2) - - lis r15,0xca62 /* K60-79 */ - ori r15,r15,0xc1d6 - STEPUP4(60, D1) - STEPUP4(64, D1) - STEPUP4(68, D1) - STEPUP4(72, D1) - lwz r20,16(r3) - STEPD1(76) - lwz r19,12(r3) - STEPD1(77) - lwz r18,8(r3) - STEPD1(78) - lwz r17,4(r3) - STEPD1(79) - - lwz r16,0(r3) - add r20,RE(80),r20 - add RD(0),RD(80),r19 - add RC(0),RC(80),r18 - add RB(0),RB(80),r17 - add RA(0),RA(80),r16 - mr RE(0),r20 - stw RA(0),0(r3) - stw RB(0),4(r3) - stw RC(0),8(r3) - stw RD(0),12(r3) - stw RE(0),16(r3) - - REST_GPRS(14, 31, r1) - addi r1,r1,INT_FRAME_SIZE - blr diff --git a/arch/powerpc/crypto/sha1-spe-asm.S b/arch/powerpc/crypto/sha1-spe-asm.S deleted file mode 100644 index 0f447523be5e..000000000000 --- a/arch/powerpc/crypto/sha1-spe-asm.S +++ /dev/null @@ -1,294 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Fast SHA-1 implementation for SPE instruction set (PPC) - * - * This code makes use of the SPE SIMD instruction set as defined in - * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf - * Implementation is based on optimization guide notes from - * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf - * - * Copyright (c) 2015 Markus Stockhausen - */ - -#include -#include - -#define rHP r3 /* pointer to hash value */ -#define rWP r4 /* pointer to input */ -#define rKP r5 /* pointer to constants */ - -#define rW0 r14 /* 64 bit round words */ -#define rW1 r15 -#define rW2 r16 -#define rW3 r17 -#define rW4 r18 -#define rW5 r19 -#define rW6 r20 -#define rW7 r21 - -#define rH0 r6 /* 32 bit hash values */ -#define rH1 r7 -#define rH2 r8 -#define rH3 r9 -#define rH4 r10 - -#define rT0 r22 /* 64 bit temporary */ -#define rT1 r0 /* 32 bit temporaries */ -#define rT2 r11 -#define rT3 r12 - -#define rK r23 /* 64 bit constant in volatile register */ - -#define LOAD_K01 - -#define LOAD_K11 \ - evlwwsplat rK,0(rKP); - -#define LOAD_K21 \ - evlwwsplat rK,4(rKP); - -#define LOAD_K31 \ - evlwwsplat rK,8(rKP); - -#define LOAD_K41 \ - evlwwsplat rK,12(rKP); - -#define INITIALIZE \ - stwu r1,-128(r1); /* create stack frame */ \ - evstdw r14,8(r1); /* We must save non volatile */ \ - evstdw r15,16(r1); /* registers. 
Take the chance */ \ - evstdw r16,24(r1); /* and save the SPE part too */ \ - evstdw r17,32(r1); \ - evstdw r18,40(r1); \ - evstdw r19,48(r1); \ - evstdw r20,56(r1); \ - evstdw r21,64(r1); \ - evstdw r22,72(r1); \ - evstdw r23,80(r1); - - -#define FINALIZE \ - evldw r14,8(r1); /* restore SPE registers */ \ - evldw r15,16(r1); \ - evldw r16,24(r1); \ - evldw r17,32(r1); \ - evldw r18,40(r1); \ - evldw r19,48(r1); \ - evldw r20,56(r1); \ - evldw r21,64(r1); \ - evldw r22,72(r1); \ - evldw r23,80(r1); \ - xor r0,r0,r0; \ - stw r0,8(r1); /* Delete sensitive data */ \ - stw r0,16(r1); /* that we might have pushed */ \ - stw r0,24(r1); /* from other context that runs */ \ - stw r0,32(r1); /* the same code. Assume that */ \ - stw r0,40(r1); /* the lower part of the GPRs */ \ - stw r0,48(r1); /* were already overwritten on */ \ - stw r0,56(r1); /* the way down to here */ \ - stw r0,64(r1); \ - stw r0,72(r1); \ - stw r0,80(r1); \ - addi r1,r1,128; /* cleanup stack frame */ - -#ifdef __BIG_ENDIAN__ -#define LOAD_DATA(reg, off) \ - lwz reg,off(rWP); /* load data */ -#define NEXT_BLOCK \ - addi rWP,rWP,64; /* increment per block */ -#else -#define LOAD_DATA(reg, off) \ - lwbrx reg,0,rWP; /* load data */ \ - addi rWP,rWP,4; /* increment per word */ -#define NEXT_BLOCK /* nothing to do */ -#endif - -#define R_00_15(a, b, c, d, e, w0, w1, k, off) \ - LOAD_DATA(w0, off) /* 1: W */ \ - and rT2,b,c; /* 1: F' = B and C */ \ - LOAD_K##k##1 \ - andc rT1,d,b; /* 1: F" = ~B and D */ \ - rotrwi rT0,a,27; /* 1: A' = A rotl 5 */ \ - or rT2,rT2,rT1; /* 1: F = F' or F" */ \ - add e,e,rT0; /* 1: E = E + A' */ \ - rotrwi b,b,2; /* 1: B = B rotl 30 */ \ - add e,e,w0; /* 1: E = E + W */ \ - LOAD_DATA(w1, off+4) /* 2: W */ \ - add e,e,rT2; /* 1: E = E + F */ \ - and rT1,a,b; /* 2: F' = B and C */ \ - add e,e,rK; /* 1: E = E + K */ \ - andc rT2,c,a; /* 2: F" = ~B and D */ \ - add d,d,rK; /* 2: E = E + K */ \ - or rT2,rT2,rT1; /* 2: F = F' or F" */ \ - rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ - add d,d,w1; /* 2: E = E + W */ \ - rotrwi a,a,2; /* 2: B = B rotl 30 */ \ - add d,d,rT0; /* 2: E = E + A' */ \ - evmergelo w1,w1,w0; /* mix W[0]/W[1] */ \ - add d,d,rT2 /* 2: E = E + F */ - -#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ - and rT2,b,c; /* 1: F' = B and C */ \ - evmergelohi rT0,w7,w6; /* W[-3] */ \ - andc rT1,d,b; /* 1: F" = ~B and D */ \ - evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ - or rT1,rT1,rT2; /* 1: F = F' or F" */ \ - evxor w0,w0,w4; /* W = W xor W[-8] */ \ - add e,e,rT1; /* 1: E = E + F */ \ - evxor w0,w0,w1; /* W = W xor W[-14] */ \ - rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ - evrlwi w0,w0,1; /* W = W rotl 1 */ \ - add e,e,rT2; /* 1: E = E + A' */ \ - evaddw rT0,w0,rK; /* WK = W + K */ \ - rotrwi b,b,2; /* 1: B = B rotl 30 */ \ - LOAD_K##k##1 \ - evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ - add e,e,rT0; /* 1: E = E + WK */ \ - add d,d,rT1; /* 2: E = E + WK */ \ - and rT2,a,b; /* 2: F' = B and C */ \ - andc rT1,c,a; /* 2: F" = ~B and D */ \ - rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ - or rT1,rT1,rT2; /* 2: F = F' or F" */ \ - add d,d,rT0; /* 2: E = E + A' */ \ - rotrwi a,a,2; /* 2: B = B rotl 30 */ \ - add d,d,rT1 /* 2: E = E + F */ - -#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ - evmergelohi rT0,w7,w6; /* W[-3] */ \ - xor rT2,b,c; /* 1: F' = B xor C */ \ - evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ - xor rT2,rT2,d; /* 1: F = F' xor D */ \ - evxor w0,w0,w4; /* W = W xor W[-8] */ \ - add e,e,rT2; /* 1: E = E + F */ \ - evxor w0,w0,w1; /* W = W xor W[-14] */ \ - rotrwi rT2,a,27; /* 1: 
A' = A rotl 5 */ \ - evrlwi w0,w0,1; /* W = W rotl 1 */ \ - add e,e,rT2; /* 1: E = E + A' */ \ - evaddw rT0,w0,rK; /* WK = W + K */ \ - rotrwi b,b,2; /* 1: B = B rotl 30 */ \ - LOAD_K##k##1 \ - evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ - add e,e,rT0; /* 1: E = E + WK */ \ - xor rT2,a,b; /* 2: F' = B xor C */ \ - add d,d,rT1; /* 2: E = E + WK */ \ - xor rT2,rT2,c; /* 2: F = F' xor D */ \ - rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ - add d,d,rT2; /* 2: E = E + F */ \ - rotrwi a,a,2; /* 2: B = B rotl 30 */ \ - add d,d,rT0 /* 2: E = E + A' */ - -#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ - and rT2,b,c; /* 1: F' = B and C */ \ - evmergelohi rT0,w7,w6; /* W[-3] */ \ - or rT1,b,c; /* 1: F" = B or C */ \ - evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ - and rT1,d,rT1; /* 1: F" = F" and D */ \ - evxor w0,w0,w4; /* W = W xor W[-8] */ \ - or rT2,rT2,rT1; /* 1: F = F' or F" */ \ - evxor w0,w0,w1; /* W = W xor W[-14] */ \ - add e,e,rT2; /* 1: E = E + F */ \ - evrlwi w0,w0,1; /* W = W rotl 1 */ \ - rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ - evaddw rT0,w0,rK; /* WK = W + K */ \ - add e,e,rT2; /* 1: E = E + A' */ \ - LOAD_K##k##1 \ - evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ - rotrwi b,b,2; /* 1: B = B rotl 30 */ \ - add e,e,rT0; /* 1: E = E + WK */ \ - and rT2,a,b; /* 2: F' = B and C */ \ - or rT0,a,b; /* 2: F" = B or C */ \ - add d,d,rT1; /* 2: E = E + WK */ \ - and rT0,c,rT0; /* 2: F" = F" and D */ \ - rotrwi a,a,2; /* 2: B = B rotl 30 */ \ - or rT2,rT2,rT0; /* 2: F = F' or F" */ \ - rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ - add d,d,rT2; /* 2: E = E + F */ \ - add d,d,rT0 /* 2: E = E + A' */ - -#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ - R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) - -_GLOBAL(ppc_spe_sha1_transform) - INITIALIZE - - lwz rH0,0(rHP) - lwz rH1,4(rHP) - mtctr r5 - lwz rH2,8(rHP) - lis rKP,PPC_SPE_SHA1_K@h - lwz rH3,12(rHP) - ori rKP,rKP,PPC_SPE_SHA1_K@l - lwz rH4,16(rHP) - -ppc_spe_sha1_main: - R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0) - R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8) - R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16) - R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24) - R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32) - R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40) - R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48) - R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56) - - R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0) - R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2) - - R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0) - R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0) - R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0) - R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0) - R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0) - R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0) - R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0) - R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0) - R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0) - R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3) - - R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0) - R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0) - R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0) - R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0) - R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0) - R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0) - R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, 
rW0, rW1, 0) - R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0) - R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0) - R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4) - - R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0) - R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0) - R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0) - R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0) - R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0) - R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0) - R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0) - lwz rT3,0(rHP) - R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0) - lwz rW1,4(rHP) - R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0) - lwz rW2,8(rHP) - R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0) - lwz rW3,12(rHP) - NEXT_BLOCK - lwz rW4,16(rHP) - - add rH0,rH0,rT3 - stw rH0,0(rHP) - add rH1,rH1,rW1 - stw rH1,4(rHP) - add rH2,rH2,rW2 - stw rH2,8(rHP) - add rH3,rH3,rW3 - stw rH3,12(rHP) - add rH4,rH4,rW4 - stw rH4,16(rHP) - - bdnz ppc_spe_sha1_main - - FINALIZE - blr - -.data -.align 4 -PPC_SPE_SHA1_K: - .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6 diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c deleted file mode 100644 index 04c88e173ce1..000000000000 --- a/arch/powerpc/crypto/sha1-spe-glue.c +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Glue code for SHA-1 implementation for SPE instructions (PPC) - * - * Based on generic implementation. - * - * Copyright (c) 2015 Markus Stockhausen - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * MAX_BYTES defines the number of bytes that are allowed to be processed - * between preempt_disable() and preempt_enable(). SHA1 takes ~1000 - * operations per 64 bytes. e500 cores can issue two arithmetic instructions - * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). - * Thus 2KB of input data will need an estimated maximum of 18,000 cycles. - * Headroom for cache misses included. Even with the low end model clocked - * at 667 MHz this equals to a critical time window of less than 27us. - * - */ -#define MAX_BYTES 2048 - -asmlinkage void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks); - -static void spe_begin(void) -{ - /* We just start SPE operations and will save SPE registers later. 
*/ - preempt_disable(); - enable_kernel_spe(); -} - -static void spe_end(void) -{ - disable_kernel_spe(); - /* reenable preemption */ - preempt_enable(); -} - -static void ppc_spe_sha1_block(struct sha1_state *sctx, const u8 *src, - int blocks) -{ - do { - int unit = min(blocks, MAX_BYTES / SHA1_BLOCK_SIZE); - - spe_begin(); - ppc_spe_sha1_transform(sctx->state, src, unit); - spe_end(); - - src += unit * SHA1_BLOCK_SIZE; - blocks -= unit; - } while (blocks); -} - -static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_base_do_update_blocks(desc, data, len, ppc_spe_sha1_block); -} - -static int ppc_spe_sha1_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *out) -{ - sha1_base_do_finup(desc, src, len, ppc_spe_sha1_block); - return sha1_base_finish(desc, out); -} - -static struct shash_alg alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = ppc_spe_sha1_update, - .finup = ppc_spe_sha1_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name= "sha1-ppc-spe", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init ppc_spe_sha1_mod_init(void) -{ - return crypto_register_shash(&alg); -} - -static void __exit ppc_spe_sha1_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(ppc_spe_sha1_mod_init); -module_exit(ppc_spe_sha1_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, SPE optimized"); - -MODULE_ALIAS_CRYPTO("sha1"); -MODULE_ALIAS_CRYPTO("sha1-ppc-spe"); diff --git a/arch/powerpc/crypto/sha1.c b/arch/powerpc/crypto/sha1.c deleted file mode 100644 index 4593946aa9b3..000000000000 --- a/arch/powerpc/crypto/sha1.c +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * - * powerpc implementation of the SHA1 Secure Hash Algorithm. - * - * Derived from cryptoapi implementation, adapted for in-place - * scatterlist interface. - * - * Derived from "crypto/sha1.c" - * Copyright (c) Alan Smithee. - * Copyright (c) Andrew McDonald - * Copyright (c) Jean-Francois Dive - */ -#include -#include -#include -#include -#include - -asmlinkage void powerpc_sha_transform(u32 *state, const u8 *src); - -static void powerpc_sha_block(struct sha1_state *sctx, const u8 *data, - int blocks) -{ - do { - powerpc_sha_transform(sctx->state, data); - data += 64; - } while (--blocks); -} - -static int powerpc_sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_base_do_update_blocks(desc, data, len, powerpc_sha_block); -} - -/* Add padding and return the message digest. 
*/ -static int powerpc_sha1_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *out) -{ - sha1_base_do_finup(desc, src, len, powerpc_sha_block); - return sha1_base_finish(desc, out); -} - -static struct shash_alg alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = powerpc_sha1_update, - .finup = powerpc_sha1_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name= "sha1-powerpc", - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha1_powerpc_mod_init(void) -{ - return crypto_register_shash(&alg); -} - -static void __exit sha1_powerpc_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(sha1_powerpc_mod_init); -module_exit(sha1_powerpc_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm"); - -MODULE_ALIAS_CRYPTO("sha1"); -MODULE_ALIAS_CRYPTO("sha1-powerpc"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 7c9dc432fb42..22ffbdab82d6 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -149,6 +149,7 @@ config CRYPTO_LIB_SHA1_ARCH default y if ARM default y if ARM64 && KERNEL_MODE_NEON default y if MIPS && CPU_CAVIUM_OCTEON + default y if PPC config CRYPTO_LIB_SHA256 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 1da13c9e2f71..02f672562928 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -77,6 +77,10 @@ libsha1-$(CONFIG_KERNEL_MODE_NEON) += arm/sha1-armv7-neon.o \ arm/sha1-ce-core.o endif libsha1-$(CONFIG_ARM64) += arm64/sha1-ce-core.o +ifeq ($(CONFIG_PPC),y) +libsha1-y += powerpc/sha1-powerpc-asm.o +libsha1-$(CONFIG_SPE) += powerpc/sha1-spe-asm.o +endif endif # CONFIG_CRYPTO_LIB_SHA1_ARCH ################################################################################ diff --git a/lib/crypto/powerpc/sha1-powerpc-asm.S b/lib/crypto/powerpc/sha1-powerpc-asm.S new file mode 100644 index 000000000000..f0d5ed557ab1 --- /dev/null +++ b/lib/crypto/powerpc/sha1-powerpc-asm.S @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * SHA-1 implementation for PowerPC. + * + * Copyright (C) 2005 Paul Mackerras + */ + +#include +#include +#include + +#ifdef __BIG_ENDIAN__ +#define LWZ(rt, d, ra) \ + lwz rt,d(ra) +#else +#define LWZ(rt, d, ra) \ + li rt,d; \ + lwbrx rt,rt,ra +#endif + +/* + * We roll the registers for T, A, B, C, D, E around on each + * iteration; T on iteration t is A on iteration t+1, and so on. + * We use registers 7 - 12 for this. 
+ */ +#define RT(t) ((((t)+5)%6)+7) +#define RA(t) ((((t)+4)%6)+7) +#define RB(t) ((((t)+3)%6)+7) +#define RC(t) ((((t)+2)%6)+7) +#define RD(t) ((((t)+1)%6)+7) +#define RE(t) ((((t)+0)%6)+7) + +/* We use registers 16 - 31 for the W values */ +#define W(t) (((t)%16)+16) + +#define LOADW(t) \ + LWZ(W(t),(t)*4,r4) + +#define STEPD0_LOAD(t) \ + andc r0,RD(t),RB(t); \ + and r6,RB(t),RC(t); \ + rotlwi RT(t),RA(t),5; \ + or r6,r6,r0; \ + add r0,RE(t),r15; \ + add RT(t),RT(t),r6; \ + add r14,r0,W(t); \ + LWZ(W((t)+4),((t)+4)*4,r4); \ + rotlwi RB(t),RB(t),30; \ + add RT(t),RT(t),r14 + +#define STEPD0_UPDATE(t) \ + and r6,RB(t),RC(t); \ + andc r0,RD(t),RB(t); \ + rotlwi RT(t),RA(t),5; \ + rotlwi RB(t),RB(t),30; \ + or r6,r6,r0; \ + add r0,RE(t),r15; \ + xor r5,W((t)+4-3),W((t)+4-8); \ + add RT(t),RT(t),r6; \ + xor W((t)+4),W((t)+4-16),W((t)+4-14); \ + add r0,r0,W(t); \ + xor W((t)+4),W((t)+4),r5; \ + add RT(t),RT(t),r0; \ + rotlwi W((t)+4),W((t)+4),1 + +#define STEPD1(t) \ + xor r6,RB(t),RC(t); \ + rotlwi RT(t),RA(t),5; \ + rotlwi RB(t),RB(t),30; \ + xor r6,r6,RD(t); \ + add r0,RE(t),r15; \ + add RT(t),RT(t),r6; \ + add r0,r0,W(t); \ + add RT(t),RT(t),r0 + +#define STEPD1_UPDATE(t) \ + xor r6,RB(t),RC(t); \ + rotlwi RT(t),RA(t),5; \ + rotlwi RB(t),RB(t),30; \ + xor r6,r6,RD(t); \ + add r0,RE(t),r15; \ + xor r5,W((t)+4-3),W((t)+4-8); \ + add RT(t),RT(t),r6; \ + xor W((t)+4),W((t)+4-16),W((t)+4-14); \ + add r0,r0,W(t); \ + xor W((t)+4),W((t)+4),r5; \ + add RT(t),RT(t),r0; \ + rotlwi W((t)+4),W((t)+4),1 + +#define STEPD2_UPDATE(t) \ + and r6,RB(t),RC(t); \ + and r0,RB(t),RD(t); \ + rotlwi RT(t),RA(t),5; \ + or r6,r6,r0; \ + rotlwi RB(t),RB(t),30; \ + and r0,RC(t),RD(t); \ + xor r5,W((t)+4-3),W((t)+4-8); \ + or r6,r6,r0; \ + xor W((t)+4),W((t)+4-16),W((t)+4-14); \ + add r0,RE(t),r15; \ + add RT(t),RT(t),r6; \ + add r0,r0,W(t); \ + xor W((t)+4),W((t)+4),r5; \ + add RT(t),RT(t),r0; \ + rotlwi W((t)+4),W((t)+4),1 + +#define STEP0LD4(t) \ + STEPD0_LOAD(t); \ + STEPD0_LOAD((t)+1); \ + STEPD0_LOAD((t)+2); \ + STEPD0_LOAD((t)+3) + +#define STEPUP4(t, fn) \ + STEP##fn##_UPDATE(t); \ + STEP##fn##_UPDATE((t)+1); \ + STEP##fn##_UPDATE((t)+2); \ + STEP##fn##_UPDATE((t)+3) + +#define STEPUP20(t, fn) \ + STEPUP4(t, fn); \ + STEPUP4((t)+4, fn); \ + STEPUP4((t)+8, fn); \ + STEPUP4((t)+12, fn); \ + STEPUP4((t)+16, fn) + +_GLOBAL(powerpc_sha_transform) + PPC_STLU r1,-INT_FRAME_SIZE(r1) + SAVE_GPRS(14, 31, r1) + + /* Load up A - E */ + lwz RA(0),0(r3) /* A */ + lwz RB(0),4(r3) /* B */ + lwz RC(0),8(r3) /* C */ + lwz RD(0),12(r3) /* D */ + lwz RE(0),16(r3) /* E */ + + LOADW(0) + LOADW(1) + LOADW(2) + LOADW(3) + + lis r15,0x5a82 /* K0-19 */ + ori r15,r15,0x7999 + STEP0LD4(0) + STEP0LD4(4) + STEP0LD4(8) + STEPUP4(12, D0) + STEPUP4(16, D0) + + lis r15,0x6ed9 /* K20-39 */ + ori r15,r15,0xeba1 + STEPUP20(20, D1) + + lis r15,0x8f1b /* K40-59 */ + ori r15,r15,0xbcdc + STEPUP20(40, D2) + + lis r15,0xca62 /* K60-79 */ + ori r15,r15,0xc1d6 + STEPUP4(60, D1) + STEPUP4(64, D1) + STEPUP4(68, D1) + STEPUP4(72, D1) + lwz r20,16(r3) + STEPD1(76) + lwz r19,12(r3) + STEPD1(77) + lwz r18,8(r3) + STEPD1(78) + lwz r17,4(r3) + STEPD1(79) + + lwz r16,0(r3) + add r20,RE(80),r20 + add RD(0),RD(80),r19 + add RC(0),RC(80),r18 + add RB(0),RB(80),r17 + add RA(0),RA(80),r16 + mr RE(0),r20 + stw RA(0),0(r3) + stw RB(0),4(r3) + stw RC(0),8(r3) + stw RD(0),12(r3) + stw RE(0),16(r3) + + REST_GPRS(14, 31, r1) + addi r1,r1,INT_FRAME_SIZE + blr diff --git a/lib/crypto/powerpc/sha1-spe-asm.S b/lib/crypto/powerpc/sha1-spe-asm.S new file mode 100644 index 
000000000000..0f447523be5e --- /dev/null +++ b/lib/crypto/powerpc/sha1-spe-asm.S @@ -0,0 +1,294 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Fast SHA-1 implementation for SPE instruction set (PPC) + * + * This code makes use of the SPE SIMD instruction set as defined in + * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf + * Implementation is based on optimization guide notes from + * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf + * + * Copyright (c) 2015 Markus Stockhausen + */ + +#include +#include + +#define rHP r3 /* pointer to hash value */ +#define rWP r4 /* pointer to input */ +#define rKP r5 /* pointer to constants */ + +#define rW0 r14 /* 64 bit round words */ +#define rW1 r15 +#define rW2 r16 +#define rW3 r17 +#define rW4 r18 +#define rW5 r19 +#define rW6 r20 +#define rW7 r21 + +#define rH0 r6 /* 32 bit hash values */ +#define rH1 r7 +#define rH2 r8 +#define rH3 r9 +#define rH4 r10 + +#define rT0 r22 /* 64 bit temporary */ +#define rT1 r0 /* 32 bit temporaries */ +#define rT2 r11 +#define rT3 r12 + +#define rK r23 /* 64 bit constant in volatile register */ + +#define LOAD_K01 + +#define LOAD_K11 \ + evlwwsplat rK,0(rKP); + +#define LOAD_K21 \ + evlwwsplat rK,4(rKP); + +#define LOAD_K31 \ + evlwwsplat rK,8(rKP); + +#define LOAD_K41 \ + evlwwsplat rK,12(rKP); + +#define INITIALIZE \ + stwu r1,-128(r1); /* create stack frame */ \ + evstdw r14,8(r1); /* We must save non volatile */ \ + evstdw r15,16(r1); /* registers. Take the chance */ \ + evstdw r16,24(r1); /* and save the SPE part too */ \ + evstdw r17,32(r1); \ + evstdw r18,40(r1); \ + evstdw r19,48(r1); \ + evstdw r20,56(r1); \ + evstdw r21,64(r1); \ + evstdw r22,72(r1); \ + evstdw r23,80(r1); + + +#define FINALIZE \ + evldw r14,8(r1); /* restore SPE registers */ \ + evldw r15,16(r1); \ + evldw r16,24(r1); \ + evldw r17,32(r1); \ + evldw r18,40(r1); \ + evldw r19,48(r1); \ + evldw r20,56(r1); \ + evldw r21,64(r1); \ + evldw r22,72(r1); \ + evldw r23,80(r1); \ + xor r0,r0,r0; \ + stw r0,8(r1); /* Delete sensitive data */ \ + stw r0,16(r1); /* that we might have pushed */ \ + stw r0,24(r1); /* from other context that runs */ \ + stw r0,32(r1); /* the same code. 
Assume that */ \ + stw r0,40(r1); /* the lower part of the GPRs */ \ + stw r0,48(r1); /* were already overwritten on */ \ + stw r0,56(r1); /* the way down to here */ \ + stw r0,64(r1); \ + stw r0,72(r1); \ + stw r0,80(r1); \ + addi r1,r1,128; /* cleanup stack frame */ + +#ifdef __BIG_ENDIAN__ +#define LOAD_DATA(reg, off) \ + lwz reg,off(rWP); /* load data */ +#define NEXT_BLOCK \ + addi rWP,rWP,64; /* increment per block */ +#else +#define LOAD_DATA(reg, off) \ + lwbrx reg,0,rWP; /* load data */ \ + addi rWP,rWP,4; /* increment per word */ +#define NEXT_BLOCK /* nothing to do */ +#endif + +#define R_00_15(a, b, c, d, e, w0, w1, k, off) \ + LOAD_DATA(w0, off) /* 1: W */ \ + and rT2,b,c; /* 1: F' = B and C */ \ + LOAD_K##k##1 \ + andc rT1,d,b; /* 1: F" = ~B and D */ \ + rotrwi rT0,a,27; /* 1: A' = A rotl 5 */ \ + or rT2,rT2,rT1; /* 1: F = F' or F" */ \ + add e,e,rT0; /* 1: E = E + A' */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + add e,e,w0; /* 1: E = E + W */ \ + LOAD_DATA(w1, off+4) /* 2: W */ \ + add e,e,rT2; /* 1: E = E + F */ \ + and rT1,a,b; /* 2: F' = B and C */ \ + add e,e,rK; /* 1: E = E + K */ \ + andc rT2,c,a; /* 2: F" = ~B and D */ \ + add d,d,rK; /* 2: E = E + K */ \ + or rT2,rT2,rT1; /* 2: F = F' or F" */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + add d,d,w1; /* 2: E = E + W */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + add d,d,rT0; /* 2: E = E + A' */ \ + evmergelo w1,w1,w0; /* mix W[0]/W[1] */ \ + add d,d,rT2 /* 2: E = E + F */ + +#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + and rT2,b,c; /* 1: F' = B and C */ \ + evmergelohi rT0,w7,w6; /* W[-3] */ \ + andc rT1,d,b; /* 1: F" = ~B and D */ \ + evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ + or rT1,rT1,rT2; /* 1: F = F' or F" */ \ + evxor w0,w0,w4; /* W = W xor W[-8] */ \ + add e,e,rT1; /* 1: E = E + F */ \ + evxor w0,w0,w1; /* W = W xor W[-14] */ \ + rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ + evrlwi w0,w0,1; /* W = W rotl 1 */ \ + add e,e,rT2; /* 1: E = E + A' */ \ + evaddw rT0,w0,rK; /* WK = W + K */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + LOAD_K##k##1 \ + evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ + add e,e,rT0; /* 1: E = E + WK */ \ + add d,d,rT1; /* 2: E = E + WK */ \ + and rT2,a,b; /* 2: F' = B and C */ \ + andc rT1,c,a; /* 2: F" = ~B and D */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + or rT1,rT1,rT2; /* 2: F = F' or F" */ \ + add d,d,rT0; /* 2: E = E + A' */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + add d,d,rT1 /* 2: E = E + F */ + +#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + evmergelohi rT0,w7,w6; /* W[-3] */ \ + xor rT2,b,c; /* 1: F' = B xor C */ \ + evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ + xor rT2,rT2,d; /* 1: F = F' xor D */ \ + evxor w0,w0,w4; /* W = W xor W[-8] */ \ + add e,e,rT2; /* 1: E = E + F */ \ + evxor w0,w0,w1; /* W = W xor W[-14] */ \ + rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ + evrlwi w0,w0,1; /* W = W rotl 1 */ \ + add e,e,rT2; /* 1: E = E + A' */ \ + evaddw rT0,w0,rK; /* WK = W + K */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + LOAD_K##k##1 \ + evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ + add e,e,rT0; /* 1: E = E + WK */ \ + xor rT2,a,b; /* 2: F' = B xor C */ \ + add d,d,rT1; /* 2: E = E + WK */ \ + xor rT2,rT2,c; /* 2: F = F' xor D */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + add d,d,rT2; /* 2: E = E + F */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + add d,d,rT0 /* 2: E = E + A' */ + +#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + and rT2,b,c; /* 1: F' = B and C */ \ + evmergelohi rT0,w7,w6; /* W[-3] */ \ + or rT1,b,c; /* 1: F" = B or C 
*/ \ + evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ + and rT1,d,rT1; /* 1: F" = F" and D */ \ + evxor w0,w0,w4; /* W = W xor W[-8] */ \ + or rT2,rT2,rT1; /* 1: F = F' or F" */ \ + evxor w0,w0,w1; /* W = W xor W[-14] */ \ + add e,e,rT2; /* 1: E = E + F */ \ + evrlwi w0,w0,1; /* W = W rotl 1 */ \ + rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ + evaddw rT0,w0,rK; /* WK = W + K */ \ + add e,e,rT2; /* 1: E = E + A' */ \ + LOAD_K##k##1 \ + evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + add e,e,rT0; /* 1: E = E + WK */ \ + and rT2,a,b; /* 2: F' = B and C */ \ + or rT0,a,b; /* 2: F" = B or C */ \ + add d,d,rT1; /* 2: E = E + WK */ \ + and rT0,c,rT0; /* 2: F" = F" and D */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + or rT2,rT2,rT0; /* 2: F = F' or F" */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + add d,d,rT2; /* 2: E = E + F */ \ + add d,d,rT0 /* 2: E = E + A' */ + +#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) + +_GLOBAL(ppc_spe_sha1_transform) + INITIALIZE + + lwz rH0,0(rHP) + lwz rH1,4(rHP) + mtctr r5 + lwz rH2,8(rHP) + lis rKP,PPC_SPE_SHA1_K@h + lwz rH3,12(rHP) + ori rKP,rKP,PPC_SPE_SHA1_K@l + lwz rH4,16(rHP) + +ppc_spe_sha1_main: + R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0) + R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8) + R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16) + R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24) + R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32) + R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40) + R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48) + R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56) + + R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0) + R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2) + + R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0) + R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0) + R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0) + R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0) + R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0) + R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0) + R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0) + R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0) + R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0) + R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3) + + R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0) + R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0) + R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0) + R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0) + R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0) + R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0) + R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0) + R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0) + R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0) + R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4) + + R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0) + R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0) + R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0) + R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0) + R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0) + R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0) + R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0) + lwz rT3,0(rHP) + R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, 
rW1, rW3, rW4, 0) + lwz rW1,4(rHP) + R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0) + lwz rW2,8(rHP) + R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0) + lwz rW3,12(rHP) + NEXT_BLOCK + lwz rW4,16(rHP) + + add rH0,rH0,rT3 + stw rH0,0(rHP) + add rH1,rH1,rW1 + stw rH1,4(rHP) + add rH2,rH2,rW2 + stw rH2,8(rHP) + add rH3,rH3,rW3 + stw rH3,12(rHP) + add rH4,rH4,rW4 + stw rH4,16(rHP) + + bdnz ppc_spe_sha1_main + + FINALIZE + blr + +.data +.align 4 +PPC_SPE_SHA1_K: + .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6 diff --git a/lib/crypto/powerpc/sha1.h b/lib/crypto/powerpc/sha1.h new file mode 100644 index 000000000000..e2c010f0370b --- /dev/null +++ b/lib/crypto/powerpc/sha1.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for PowerPC + * + * Copyright (c) 2015 Markus Stockhausen + */ + +#include +#include + +#ifdef CONFIG_SPE +/* + * MAX_BYTES defines the number of bytes that are allowed to be processed + * between preempt_disable() and preempt_enable(). SHA1 takes ~1000 + * operations per 64 bytes. e500 cores can issue two arithmetic instructions + * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). + * Thus 2KB of input data will need an estimated maximum of 18,000 cycles. + * Headroom for cache misses included. Even with the low end model clocked + * at 667 MHz this equals to a critical time window of less than 27us. + * + */ +#define MAX_BYTES 2048 + +asmlinkage void ppc_spe_sha1_transform(struct sha1_block_state *state, + const u8 *data, u32 nblocks); + +static void spe_begin(void) +{ + /* We just start SPE operations and will save SPE registers later. */ + preempt_disable(); + enable_kernel_spe(); +} + +static void spe_end(void) +{ + disable_kernel_spe(); + /* reenable preemption */ + preempt_enable(); +} + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + u32 unit = min_t(size_t, nblocks, MAX_BYTES / SHA1_BLOCK_SIZE); + + spe_begin(); + ppc_spe_sha1_transform(state, data, unit); + spe_end(); + + data += unit * SHA1_BLOCK_SIZE; + nblocks -= unit; + } while (nblocks); +} +#else /* CONFIG_SPE */ +asmlinkage void powerpc_sha_transform(struct sha1_block_state *state, + const u8 data[SHA1_BLOCK_SIZE]); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + powerpc_sha_transform(state, data); + data += SHA1_BLOCK_SIZE; + } while (--nblocks); +} +#endif /* !CONFIG_SPE */ -- cgit v1.2.3 From 377982d5618a7b80bf2ad3eed9aa62691e984d50 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:23:02 -0700 Subject: lib/crypto: s390/sha1: Migrate optimized code into library Instead of exposing the s390-optimized SHA-1 code via s390-specific crypto_shash algorithms, instead just implement the sha1_blocks() library function. This is much simpler, it makes the SHA-1 library functions be s390-optimized, and it fixes the longstanding issue where the s390-optimized SHA-1 code was disabled by default. SHA-1 still remains available through crypto_shash, but individual architectures no longer need to handle it. 
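For readers following the conversion, the per-architecture glue that these patches add shares one shape: a lib/crypto/<arch>/sha1.h header that supplies sha1_blocks() and, when runtime detection is needed, sha1_mod_init_arch(). The sketch below is illustrative only and uses a made-up architecture; the feature probe (myarch_cpu_has_sha1) and the transform name are placeholders, while struct sha1_block_state, sha1_blocks_generic() and the static-key pattern are taken from the real headers added in this and the following patches.

	/* Sketch of the lib/crypto/<arch>/sha1.h pattern; "myarch" is hypothetical. */
	static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_myarch_sha1);

	asmlinkage void myarch_sha1_transform(struct sha1_block_state *state,
					      const u8 *data, size_t nblocks);

	static void sha1_blocks(struct sha1_block_state *state,
				const u8 *data, size_t nblocks)
	{
		/* Use the accelerated transform when available, otherwise the generic C code. */
		if (static_branch_likely(&have_myarch_sha1))
			myarch_sha1_transform(state, data, nblocks);
		else
			sha1_blocks_generic(state, data, nblocks);
	}

	#define sha1_mod_init_arch sha1_mod_init_arch
	static inline void sha1_mod_init_arch(void)
	{
		/* myarch_cpu_has_sha1() stands in for the real feature probe
		 * (CPACF query on s390, %asr26 read on sparc64, and so on). */
		if (myarch_cpu_has_sha1())
			static_branch_enable(&have_myarch_sha1);
	}

With this contract in place, the library's sha1_blocks() callers need no per-architecture knowledge at all; only the header decides whether to dispatch to assembly or to the generic implementation.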
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-12-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - arch/s390/crypto/Kconfig | 10 ---- arch/s390/crypto/Makefile | 1 - arch/s390/crypto/sha1_s390.c | 103 -------------------------------------- lib/crypto/Kconfig | 1 + lib/crypto/s390/sha1.h | 28 +++++++++++ 7 files changed, 29 insertions(+), 116 deletions(-) delete mode 100644 arch/s390/crypto/sha1_s390.c create mode 100644 lib/crypto/s390/sha1.h diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index ef313c30b375..a7db7ed28720 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -804,7 +804,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA3_256_S390=m CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index b6fa341bb03b..0217ed5616bf 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -791,7 +791,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA3_256_S390=m CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig index 4557514fbac3..03f73fbd38b6 100644 --- a/arch/s390/crypto/Kconfig +++ b/arch/s390/crypto/Kconfig @@ -2,16 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (s390)" -config CRYPTO_SHA1_S390 - tristate "Hash functions: SHA-1" - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: s390 - - It is available as of z990. - config CRYPTO_SHA3_256_S390 tristate "Hash functions: SHA3-224 and SHA3-256" select CRYPTO_HASH diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 473d64c0982a..1e5a1038d491 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -3,7 +3,6 @@ # Cryptographic API # -obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA3_256_S390) += sha3_256_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c deleted file mode 100644 index d229cbd2ba22..000000000000 --- a/arch/s390/crypto/sha1_s390.c +++ /dev/null @@ -1,103 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA1 Secure Hash Algorithm. - * - * Derived from cryptoapi implementation, adapted for in-place - * scatterlist interface. Originally based on the public domain - * implementation written by Steve Reid. - * - * s390 Version: - * Copyright IBM Corp. 2003, 2007 - * Author(s): Thomas Spatzier - * Jan Glauber (jan.glauber@de.ibm.com) - * - * Derived from "crypto/sha1_generic.c" - * Copyright (c) Alan Smithee. 
- * Copyright (c) Andrew McDonald - * Copyright (c) Jean-Francois Dive - */ -#include -#include -#include -#include -#include -#include - -#include "sha.h" - -static int s390_sha1_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA1_H0; - sctx->state[1] = SHA1_H1; - sctx->state[2] = SHA1_H2; - sctx->state[3] = SHA1_H3; - sctx->state[4] = SHA1_H4; - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA_1; - - return 0; -} - -static int s390_sha1_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - struct sha1_state *octx = out; - - octx->count = sctx->count; - memcpy(octx->state, sctx->state, sizeof(octx->state)); - return 0; -} - -static int s390_sha1_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - const struct sha1_state *ictx = in; - - sctx->count = ictx->count; - memcpy(sctx->state, ictx->state, sizeof(ictx->state)); - sctx->func = CPACF_KIMD_SHA_1; - return 0; -} - -static struct shash_alg alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = s390_sha1_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = s390_sha1_export, - .import = s390_sha1_import, - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name= "sha1-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha1_s390_init(void) -{ - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1)) - return -ENODEV; - return crypto_register_shash(&alg); -} - -static void __exit sha1_s390_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha1_s390_init); -module_exit(sha1_s390_fini); - -MODULE_ALIAS_CRYPTO("sha1"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 22ffbdab82d6..7da0432d8164 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -150,6 +150,7 @@ config CRYPTO_LIB_SHA1_ARCH default y if ARM64 && KERNEL_MODE_NEON default y if MIPS && CPU_CAVIUM_OCTEON default y if PPC + default y if S390 config CRYPTO_LIB_SHA256 tristate diff --git a/lib/crypto/s390/sha1.h b/lib/crypto/s390/sha1.h new file mode 100644 index 000000000000..08bd138e881c --- /dev/null +++ b/lib/crypto/s390/sha1.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha1); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_cpacf_sha1)) + cpacf_kimd(CPACF_KIMD_SHA_1, state, data, + nblocks * SHA1_BLOCK_SIZE); + else + sha1_blocks_generic(state, data, nblocks); +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (cpu_have_feature(S390_CPU_FEATURE_MSA) && + cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1)) + static_branch_enable(&have_cpacf_sha1); +} -- cgit v1.2.3 From c751059985e02467c7fa6b14676c1d56d089b3cc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:23:03 -0700 Subject: lib/crypto: sparc/sha1: Migrate optimized code into library Instead of exposing the sparc-optimized SHA-1 code via 
sparc-specific crypto_shash algorithms, instead just implement the sha1_blocks() library function. This is much simpler, it makes the SHA-1 library functions be sparc-optimized, and it fixes the longstanding issue where the sparc-optimized SHA-1 code was disabled by default. SHA-1 still remains available through crypto_shash, but individual architectures no longer need to handle it. Note: to see the diff from arch/sparc/crypto/sha1_glue.c to lib/crypto/sparc/sha1.h, view this commit with 'git show -M10'. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-13-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/sparc/crypto/Kconfig | 10 ----- arch/sparc/crypto/Makefile | 2 - arch/sparc/crypto/sha1_asm.S | 72 --------------------------------- arch/sparc/crypto/sha1_glue.c | 94 ------------------------------------------- lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 1 + lib/crypto/sparc/sha1.h | 43 ++++++++++++++++++++ lib/crypto/sparc/sha1_asm.S | 72 +++++++++++++++++++++++++++++++++ 8 files changed, 117 insertions(+), 178 deletions(-) delete mode 100644 arch/sparc/crypto/sha1_asm.S delete mode 100644 arch/sparc/crypto/sha1_glue.c create mode 100644 lib/crypto/sparc/sha1.h create mode 100644 lib/crypto/sparc/sha1_asm.S diff --git a/arch/sparc/crypto/Kconfig b/arch/sparc/crypto/Kconfig index 9d8da9aef3a4..f5b2e720fec3 100644 --- a/arch/sparc/crypto/Kconfig +++ b/arch/sparc/crypto/Kconfig @@ -26,16 +26,6 @@ config CRYPTO_MD5_SPARC64 Architecture: sparc64 using crypto instructions, when available -config CRYPTO_SHA1_SPARC64 - tristate "Hash functions: SHA-1" - depends on SPARC64 - select CRYPTO_SHA1 - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: sparc64 - config CRYPTO_AES_SPARC64 tristate "Ciphers: AES, modes: ECB, CBC, CTR" depends on SPARC64 diff --git a/arch/sparc/crypto/Makefile b/arch/sparc/crypto/Makefile index 99a7e8fd13bc..0d05a17988c4 100644 --- a/arch/sparc/crypto/Makefile +++ b/arch/sparc/crypto/Makefile @@ -3,14 +3,12 @@ # Arch-specific CryptoAPI modules. 
# -obj-$(CONFIG_CRYPTO_SHA1_SPARC64) += sha1-sparc64.o obj-$(CONFIG_CRYPTO_MD5_SPARC64) += md5-sparc64.o obj-$(CONFIG_CRYPTO_AES_SPARC64) += aes-sparc64.o obj-$(CONFIG_CRYPTO_DES_SPARC64) += des-sparc64.o obj-$(CONFIG_CRYPTO_CAMELLIA_SPARC64) += camellia-sparc64.o -sha1-sparc64-y := sha1_asm.o sha1_glue.o md5-sparc64-y := md5_asm.o md5_glue.o aes-sparc64-y := aes_asm.o aes_glue.o diff --git a/arch/sparc/crypto/sha1_asm.S b/arch/sparc/crypto/sha1_asm.S deleted file mode 100644 index 00b46bac1b08..000000000000 --- a/arch/sparc/crypto/sha1_asm.S +++ /dev/null @@ -1,72 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include - -ENTRY(sha1_sparc64_transform) - /* %o0 = digest, %o1 = data, %o2 = rounds */ - VISEntryHalf - ld [%o0 + 0x00], %f0 - ld [%o0 + 0x04], %f1 - ld [%o0 + 0x08], %f2 - andcc %o1, 0x7, %g0 - ld [%o0 + 0x0c], %f3 - bne,pn %xcc, 10f - ld [%o0 + 0x10], %f4 - -1: - ldd [%o1 + 0x00], %f8 - ldd [%o1 + 0x08], %f10 - ldd [%o1 + 0x10], %f12 - ldd [%o1 + 0x18], %f14 - ldd [%o1 + 0x20], %f16 - ldd [%o1 + 0x28], %f18 - ldd [%o1 + 0x30], %f20 - ldd [%o1 + 0x38], %f22 - - SHA1 - - subcc %o2, 1, %o2 - bne,pt %xcc, 1b - add %o1, 0x40, %o1 - -5: - st %f0, [%o0 + 0x00] - st %f1, [%o0 + 0x04] - st %f2, [%o0 + 0x08] - st %f3, [%o0 + 0x0c] - st %f4, [%o0 + 0x10] - retl - VISExitHalf -10: - alignaddr %o1, %g0, %o1 - - ldd [%o1 + 0x00], %f10 -1: - ldd [%o1 + 0x08], %f12 - ldd [%o1 + 0x10], %f14 - ldd [%o1 + 0x18], %f16 - ldd [%o1 + 0x20], %f18 - ldd [%o1 + 0x28], %f20 - ldd [%o1 + 0x30], %f22 - ldd [%o1 + 0x38], %f24 - ldd [%o1 + 0x40], %f26 - - faligndata %f10, %f12, %f8 - faligndata %f12, %f14, %f10 - faligndata %f14, %f16, %f12 - faligndata %f16, %f18, %f14 - faligndata %f18, %f20, %f16 - faligndata %f20, %f22, %f18 - faligndata %f22, %f24, %f20 - faligndata %f24, %f26, %f22 - - SHA1 - - subcc %o2, 1, %o2 - fsrc2 %f26, %f10 - bne,pt %xcc, 1b - add %o1, 0x40, %o1 - - ba,a,pt %xcc, 5b -ENDPROC(sha1_sparc64_transform) diff --git a/arch/sparc/crypto/sha1_glue.c b/arch/sparc/crypto/sha1_glue.c deleted file mode 100644 index ef19d5023b1b..000000000000 --- a/arch/sparc/crypto/sha1_glue.c +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Glue code for SHA1 hashing optimized for sparc64 crypto opcodes. - * - * This is based largely upon arch/x86/crypto/sha1_ssse3_glue.c - * - * Copyright (c) Alan Smithee. - * Copyright (c) Andrew McDonald - * Copyright (c) Jean-Francois Dive - * Copyright (c) Mathias Krause - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void sha1_sparc64_transform(struct sha1_state *digest, - const u8 *data, int rounds); - -static int sha1_sparc64_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_base_do_update_blocks(desc, data, len, - sha1_sparc64_transform); -} - -/* Add padding and return the message digest. 
*/ -static int sha1_sparc64_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *out) -{ - sha1_base_do_finup(desc, src, len, sha1_sparc64_transform); - return sha1_base_finish(desc, out); -} - -static struct shash_alg alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_sparc64_update, - .finup = sha1_sparc64_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name= "sha1-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static bool __init sparc64_has_sha1_opcode(void) -{ - unsigned long cfr; - - if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) - return false; - - __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); - if (!(cfr & CFR_SHA1)) - return false; - - return true; -} - -static int __init sha1_sparc64_mod_init(void) -{ - if (sparc64_has_sha1_opcode()) { - pr_info("Using sparc64 sha1 opcode optimized SHA-1 implementation\n"); - return crypto_register_shash(&alg); - } - pr_info("sparc64 sha1 opcode not available.\n"); - return -ENODEV; -} - -static void __exit sha1_sparc64_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(sha1_sparc64_mod_init); -module_exit(sha1_sparc64_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, sparc64 sha1 opcode accelerated"); - -MODULE_ALIAS_CRYPTO("sha1"); - -#include "crop_devid.c" diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 7da0432d8164..019034f2bb53 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -151,6 +151,7 @@ config CRYPTO_LIB_SHA1_ARCH default y if MIPS && CPU_CAVIUM_OCTEON default y if PPC default y if S390 + default y if SPARC64 config CRYPTO_LIB_SHA256 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 02f672562928..7897da2de623 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -81,6 +81,7 @@ ifeq ($(CONFIG_PPC),y) libsha1-y += powerpc/sha1-powerpc-asm.o libsha1-$(CONFIG_SPE) += powerpc/sha1-spe-asm.o endif +libsha1-$(CONFIG_SPARC) += sparc/sha1_asm.o endif # CONFIG_CRYPTO_LIB_SHA1_ARCH ################################################################################ diff --git a/lib/crypto/sparc/sha1.h b/lib/crypto/sparc/sha1.h new file mode 100644 index 000000000000..5015f93584b7 --- /dev/null +++ b/lib/crypto/sparc/sha1.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SHA-1 accelerated using the sparc64 crypto opcodes + * + * Copyright (c) Alan Smithee. 
+ * Copyright (c) Andrew McDonald + * Copyright (c) Jean-Francois Dive + * Copyright (c) Mathias Krause + */ + +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha1_opcodes); + +asmlinkage void sha1_sparc64_transform(struct sha1_block_state *state, + const u8 *data, size_t nblocks); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha1_opcodes)) + sha1_sparc64_transform(state, data, nblocks); + else + sha1_blocks_generic(state, data, nblocks); +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA1)) + return; + + static_branch_enable(&have_sha1_opcodes); + pr_info("Using sparc64 sha1 opcode optimized SHA-1 implementation\n"); +} diff --git a/lib/crypto/sparc/sha1_asm.S b/lib/crypto/sparc/sha1_asm.S new file mode 100644 index 000000000000..00b46bac1b08 --- /dev/null +++ b/lib/crypto/sparc/sha1_asm.S @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include + +ENTRY(sha1_sparc64_transform) + /* %o0 = digest, %o1 = data, %o2 = rounds */ + VISEntryHalf + ld [%o0 + 0x00], %f0 + ld [%o0 + 0x04], %f1 + ld [%o0 + 0x08], %f2 + andcc %o1, 0x7, %g0 + ld [%o0 + 0x0c], %f3 + bne,pn %xcc, 10f + ld [%o0 + 0x10], %f4 + +1: + ldd [%o1 + 0x00], %f8 + ldd [%o1 + 0x08], %f10 + ldd [%o1 + 0x10], %f12 + ldd [%o1 + 0x18], %f14 + ldd [%o1 + 0x20], %f16 + ldd [%o1 + 0x28], %f18 + ldd [%o1 + 0x30], %f20 + ldd [%o1 + 0x38], %f22 + + SHA1 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + +5: + st %f0, [%o0 + 0x00] + st %f1, [%o0 + 0x04] + st %f2, [%o0 + 0x08] + st %f3, [%o0 + 0x0c] + st %f4, [%o0 + 0x10] + retl + VISExitHalf +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f10 +1: + ldd [%o1 + 0x08], %f12 + ldd [%o1 + 0x10], %f14 + ldd [%o1 + 0x18], %f16 + ldd [%o1 + 0x20], %f18 + ldd [%o1 + 0x28], %f20 + ldd [%o1 + 0x30], %f22 + ldd [%o1 + 0x38], %f24 + ldd [%o1 + 0x40], %f26 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + SHA1 + + subcc %o2, 1, %o2 + fsrc2 %f26, %f10 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(sha1_sparc64_transform) -- cgit v1.2.3 From f3d6cb3dc0394b866bc0d1e15157ce45844cf3d3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:23:04 -0700 Subject: lib/crypto: x86/sha1: Migrate optimized code into library Instead of exposing the x86-optimized SHA-1 code via x86-specific crypto_shash algorithms, instead just implement the sha1_blocks() library function. This is much simpler, it makes the SHA-1 library functions be x86-optimized, and it fixes the longstanding issue where the x86-optimized SHA-1 code was disabled by default. SHA-1 still remains available through crypto_shash, but individual architectures no longer need to handle it. To match sha1_blocks(), change the type of the nblocks parameter of the assembly functions from int to size_t. The assembly functions actually already treated it as size_t. 
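To make the prototype change concrete: the removed x86 glue passed the block count as an int (the deleted comment in sha1_avx2_x86_64_asm.S below still documents "int blocks"), whereas the library contract uses size_t. The sketch below shows the assumed shape of the new declaration; it is inferred from the sha1_blocks() signature used by the other architecture headers in this series, not quoted from the new lib/crypto/x86/sha1.h, which falls outside this excerpt.

	/*
	 * Previously (removed glue, per the old comment):
	 *	extern "C" void sha1_transform_avx2(struct sha1_state *state,
	 *					    const u8 *input, int blocks);
	 *
	 * Assumed new shape: a size_t block count, so the assembly transform can
	 * back sha1_blocks(struct sha1_block_state *, const u8 *, size_t) directly.
	 */
	asmlinkage void sha1_transform_avx2(struct sha1_block_state *state,
					    const u8 *data, size_t nblocks);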
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-14-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/x86/crypto/Kconfig | 14 - arch/x86/crypto/Makefile | 3 - arch/x86/crypto/sha1_avx2_x86_64_asm.S | 700 --------------------------------- arch/x86/crypto/sha1_ni_asm.S | 304 -------------- arch/x86/crypto/sha1_ssse3_asm.S | 554 -------------------------- arch/x86/crypto/sha1_ssse3_glue.c | 324 --------------- lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 3 + lib/crypto/x86/sha1-avx2-asm.S | 697 ++++++++++++++++++++++++++++++++ lib/crypto/x86/sha1-ni-asm.S | 299 ++++++++++++++ lib/crypto/x86/sha1-ssse3-and-avx.S | 551 ++++++++++++++++++++++++++ lib/crypto/x86/sha1.h | 74 ++++ 12 files changed, 1625 insertions(+), 1899 deletions(-) delete mode 100644 arch/x86/crypto/sha1_avx2_x86_64_asm.S delete mode 100644 arch/x86/crypto/sha1_ni_asm.S delete mode 100644 arch/x86/crypto/sha1_ssse3_asm.S delete mode 100644 arch/x86/crypto/sha1_ssse3_glue.c create mode 100644 lib/crypto/x86/sha1-avx2-asm.S create mode 100644 lib/crypto/x86/sha1-ni-asm.S create mode 100644 lib/crypto/x86/sha1-ssse3-and-avx.S create mode 100644 lib/crypto/x86/sha1.h diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index eb641a300154..94016c60561e 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -376,20 +376,6 @@ config CRYPTO_POLYVAL_CLMUL_NI Architecture: x86_64 using: - CLMUL-NI (carry-less multiplication new instructions) -config CRYPTO_SHA1_SSSE3 - tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)" - depends on 64BIT - select CRYPTO_SHA1 - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX (Advanced Vector Extensions) - - AVX2 (Advanced Vector Extensions 2) - - SHA-NI (SHA Extensions New Instructions) - config CRYPTO_SM3_AVX_X86_64 tristate "Hash functions: SM3 (AVX)" depends on 64BIT diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index d31348be8370..d402963d6b57 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -51,9 +51,6 @@ ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy) aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o endif -obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o -sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o - obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S deleted file mode 100644 index 4b49bdc95265..000000000000 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ /dev/null @@ -1,700 +0,0 @@ -/* - * Implement fast SHA-1 with AVX2 instructions. (x86_64) - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2014 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * Contact Information: - * Ilya Albrekht - * Maxim Locktyukhin - * Ronen Zohar - * Chandramouli Narayanan - * - * BSD LICENSE - * - * Copyright(c) 2014 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -/* - * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. - * - *This implementation is based on the previous SSSE3 release: - *Visit http://software.intel.com/en-us/articles/ - *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ - * - *Updates 20-byte SHA-1 record at start of 'state', from 'input', for - *even number of 'blocks' consecutive 64-byte blocks. 
- * - *extern "C" void sha1_transform_avx2( - * struct sha1_state *state, const u8* input, int blocks ); - */ - -#include - -#define CTX %rdi /* arg1 */ -#define BUF %rsi /* arg2 */ -#define CNT %rdx /* arg3 */ - -#define REG_A %ecx -#define REG_B %esi -#define REG_C %edi -#define REG_D %eax -#define REG_E %edx -#define REG_TB %ebx -#define REG_TA %r12d -#define REG_RA %rcx -#define REG_RB %rsi -#define REG_RC %rdi -#define REG_RD %rax -#define REG_RE %rdx -#define REG_RTA %r12 -#define REG_RTB %rbx -#define REG_T1 %r11d -#define xmm_mov vmovups -#define avx2_zeroupper vzeroupper -#define RND_F1 1 -#define RND_F2 2 -#define RND_F3 3 - -.macro REGALLOC - .set A, REG_A - .set B, REG_B - .set C, REG_C - .set D, REG_D - .set E, REG_E - .set TB, REG_TB - .set TA, REG_TA - - .set RA, REG_RA - .set RB, REG_RB - .set RC, REG_RC - .set RD, REG_RD - .set RE, REG_RE - - .set RTA, REG_RTA - .set RTB, REG_RTB - - .set T1, REG_T1 -.endm - -#define HASH_PTR %r9 -#define BLOCKS_CTR %r8 -#define BUFFER_PTR %r10 -#define BUFFER_PTR2 %r13 - -#define PRECALC_BUF %r14 -#define WK_BUF %r15 - -#define W_TMP %xmm0 -#define WY_TMP %ymm0 -#define WY_TMP2 %ymm9 - -# AVX2 variables -#define WY0 %ymm3 -#define WY4 %ymm5 -#define WY08 %ymm7 -#define WY12 %ymm8 -#define WY16 %ymm12 -#define WY20 %ymm13 -#define WY24 %ymm14 -#define WY28 %ymm15 - -#define YMM_SHUFB_BSWAP %ymm10 - -/* - * Keep 2 iterations precalculated at a time: - * - 80 DWORDs per iteration * 2 - */ -#define W_SIZE (80*2*2 +16) - -#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) -#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) - - -.macro UPDATE_HASH hash, val - add \hash, \val - mov \val, \hash -.endm - -.macro PRECALC_RESET_WY - .set WY_00, WY0 - .set WY_04, WY4 - .set WY_08, WY08 - .set WY_12, WY12 - .set WY_16, WY16 - .set WY_20, WY20 - .set WY_24, WY24 - .set WY_28, WY28 - .set WY_32, WY_00 -.endm - -.macro PRECALC_ROTATE_WY - /* Rotate macros */ - .set WY_32, WY_28 - .set WY_28, WY_24 - .set WY_24, WY_20 - .set WY_20, WY_16 - .set WY_16, WY_12 - .set WY_12, WY_08 - .set WY_08, WY_04 - .set WY_04, WY_00 - .set WY_00, WY_32 - - /* Define register aliases */ - .set WY, WY_00 - .set WY_minus_04, WY_04 - .set WY_minus_08, WY_08 - .set WY_minus_12, WY_12 - .set WY_minus_16, WY_16 - .set WY_minus_20, WY_20 - .set WY_minus_24, WY_24 - .set WY_minus_28, WY_28 - .set WY_minus_32, WY -.endm - -.macro PRECALC_00_15 - .if (i == 0) # Initialize and rotate registers - PRECALC_RESET_WY - PRECALC_ROTATE_WY - .endif - - /* message scheduling pre-compute for rounds 0-15 */ - .if ((i & 7) == 0) - /* - * blended AVX2 and ALU instruction scheduling - * 1 vector iteration per 8 rounds - */ - vmovdqu (i * 2)(BUFFER_PTR), W_TMP - .elseif ((i & 7) == 1) - vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ - WY_TMP, WY_TMP - .elseif ((i & 7) == 2) - vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY - .elseif ((i & 7) == 4) - vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP - .elseif ((i & 7) == 7) - vmovdqu WY_TMP, PRECALC_WK(i&~7) - - PRECALC_ROTATE_WY - .endif -.endm - -.macro PRECALC_16_31 - /* - * message scheduling pre-compute for rounds 16-31 - * calculating last 32 w[i] values in 8 XMM registers - * pre-calculate K+w[i] values and store to mem - * for later load by ALU add instruction - * - * "brute force" vectorization for rounds 16-31 only - * due to w[i]->w[i-3] dependency - */ - .if ((i & 7) == 0) - /* - * blended AVX2 and ALU instruction scheduling - * 1 vector iteration per 8 rounds - */ - /* w[i-14] */ - vpalignr $8, WY_minus_16, WY_minus_12, WY - vpsrldq $4, 
WY_minus_04, WY_TMP /* w[i-3] */ - .elseif ((i & 7) == 1) - vpxor WY_minus_08, WY, WY - vpxor WY_minus_16, WY_TMP, WY_TMP - .elseif ((i & 7) == 2) - vpxor WY_TMP, WY, WY - vpslldq $12, WY, WY_TMP2 - .elseif ((i & 7) == 3) - vpslld $1, WY, WY_TMP - vpsrld $31, WY, WY - .elseif ((i & 7) == 4) - vpor WY, WY_TMP, WY_TMP - vpslld $2, WY_TMP2, WY - .elseif ((i & 7) == 5) - vpsrld $30, WY_TMP2, WY_TMP2 - vpxor WY, WY_TMP, WY_TMP - .elseif ((i & 7) == 7) - vpxor WY_TMP2, WY_TMP, WY - vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP - vmovdqu WY_TMP, PRECALC_WK(i&~7) - - PRECALC_ROTATE_WY - .endif -.endm - -.macro PRECALC_32_79 - /* - * in SHA-1 specification: - * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 - * instead we do equal: - * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 - * allows more efficient vectorization - * since w[i]=>w[i-3] dependency is broken - */ - - .if ((i & 7) == 0) - /* - * blended AVX2 and ALU instruction scheduling - * 1 vector iteration per 8 rounds - */ - vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP - .elseif ((i & 7) == 1) - /* W is W_minus_32 before xor */ - vpxor WY_minus_28, WY, WY - .elseif ((i & 7) == 2) - vpxor WY_minus_16, WY_TMP, WY_TMP - .elseif ((i & 7) == 3) - vpxor WY_TMP, WY, WY - .elseif ((i & 7) == 4) - vpslld $2, WY, WY_TMP - .elseif ((i & 7) == 5) - vpsrld $30, WY, WY - vpor WY, WY_TMP, WY - .elseif ((i & 7) == 7) - vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP - vmovdqu WY_TMP, PRECALC_WK(i&~7) - - PRECALC_ROTATE_WY - .endif -.endm - -.macro PRECALC r, s - .set i, \r - - .if (i < 40) - .set K_XMM, 32*0 - .elseif (i < 80) - .set K_XMM, 32*1 - .elseif (i < 120) - .set K_XMM, 32*2 - .else - .set K_XMM, 32*3 - .endif - - .if (i<32) - PRECALC_00_15 \s - .elseif (i<64) - PRECALC_16_31 \s - .elseif (i < 160) - PRECALC_32_79 \s - .endif -.endm - -.macro ROTATE_STATE - .set T_REG, E - .set E, D - .set D, C - .set C, B - .set B, TB - .set TB, A - .set A, T_REG - - .set T_REG, RE - .set RE, RD - .set RD, RC - .set RC, RB - .set RB, RTB - .set RTB, RA - .set RA, T_REG -.endm - -/* Macro relies on saved ROUND_Fx */ - -.macro RND_FUN f, r - .if (\f == RND_F1) - ROUND_F1 \r - .elseif (\f == RND_F2) - ROUND_F2 \r - .elseif (\f == RND_F3) - ROUND_F3 \r - .endif -.endm - -.macro RR r - .set round_id, (\r % 80) - - .if (round_id == 0) /* Precalculate F for first round */ - .set ROUND_FUNC, RND_F1 - mov B, TB - - rorx $(32-30), B, B /* b>>>2 */ - andn D, TB, T1 - and C, TB - xor T1, TB - .endif - - RND_FUN ROUND_FUNC, \r - ROTATE_STATE - - .if (round_id == 18) - .set ROUND_FUNC, RND_F2 - .elseif (round_id == 38) - .set ROUND_FUNC, RND_F3 - .elseif (round_id == 58) - .set ROUND_FUNC, RND_F2 - .endif - - .set round_id, ( (\r+1) % 80) - - RND_FUN ROUND_FUNC, (\r+1) - ROTATE_STATE -.endm - -.macro ROUND_F1 r - add WK(\r), E - - andn C, A, T1 /* ~b&d */ - lea (RE,RTB), E /* Add F from the previous round */ - - rorx $(32-5), A, TA /* T2 = A >>> 5 */ - rorx $(32-30),A, TB /* b>>>2 for next round */ - - PRECALC (\r) /* msg scheduling for next 2 blocks */ - - /* - * Calculate F for the next round - * (b & c) ^ andn[b, d] - */ - and B, A /* b&c */ - xor T1, A /* F1 = (b&c) ^ (~b&d) */ - - lea (RE,RTA), E /* E += A >>> 5 */ -.endm - -.macro ROUND_F2 r - add WK(\r), E - lea (RE,RTB), E /* Add F from the previous round */ - - /* Calculate F for the next round */ - rorx $(32-5), A, TA /* T2 = A >>> 5 */ - .if ((round_id) < 79) - rorx $(32-30), A, TB /* b>>>2 for next round */ - .endif - PRECALC (\r) /* msg scheduling for next 2 blocks */ - - .if ((round_id) < 79) - xor B, A - 
.endif - - add TA, E /* E += A >>> 5 */ - - .if ((round_id) < 79) - xor C, A - .endif -.endm - -.macro ROUND_F3 r - add WK(\r), E - PRECALC (\r) /* msg scheduling for next 2 blocks */ - - lea (RE,RTB), E /* Add F from the previous round */ - - mov B, T1 - or A, T1 - - rorx $(32-5), A, TA /* T2 = A >>> 5 */ - rorx $(32-30), A, TB /* b>>>2 for next round */ - - /* Calculate F for the next round - * (b and c) or (d and (b or c)) - */ - and C, T1 - and B, A - or T1, A - - add TA, E /* E += A >>> 5 */ - -.endm - -/* Add constant only if (%2 > %3) condition met (uses RTA as temp) - * %1 + %2 >= %3 ? %4 : 0 - */ -.macro ADD_IF_GE a, b, c, d - mov \a, RTA - add $\d, RTA - cmp $\c, \b - cmovge RTA, \a -.endm - -/* - * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining - */ -.macro SHA1_PIPELINED_MAIN_BODY - - REGALLOC - - mov (HASH_PTR), A - mov 4(HASH_PTR), B - mov 8(HASH_PTR), C - mov 12(HASH_PTR), D - mov 16(HASH_PTR), E - - mov %rsp, PRECALC_BUF - lea (2*4*80+32)(%rsp), WK_BUF - - # Precalc WK for first 2 blocks - ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 - .set i, 0 - .rept 160 - PRECALC i - .set i, i + 1 - .endr - - /* Go to next block if needed */ - ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 - ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 - xchg WK_BUF, PRECALC_BUF - - .align 32 -.L_loop: - /* - * code loops through more than one block - * we use K_BASE value as a signal of a last block, - * it is set below by: cmovae BUFFER_PTR, K_BASE - */ - test BLOCKS_CTR, BLOCKS_CTR - jnz .L_begin - .align 32 - jmp .L_end - .align 32 -.L_begin: - - /* - * Do first block - * rounds: 0,2,4,6,8 - */ - .set j, 0 - .rept 5 - RR j - .set j, j+2 - .endr - - /* - * rounds: - * 10,12,14,16,18 - * 20,22,24,26,28 - * 30,32,34,36,38 - * 40,42,44,46,48 - * 50,52,54,56,58 - */ - .rept 25 - RR j - .set j, j+2 - .endr - - /* Update Counter */ - sub $1, BLOCKS_CTR - /* Move to the next block only if needed*/ - ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 - /* - * rounds - * 60,62,64,66,68 - * 70,72,74,76,78 - */ - .rept 10 - RR j - .set j, j+2 - .endr - - UPDATE_HASH (HASH_PTR), A - UPDATE_HASH 4(HASH_PTR), TB - UPDATE_HASH 8(HASH_PTR), C - UPDATE_HASH 12(HASH_PTR), D - UPDATE_HASH 16(HASH_PTR), E - - test BLOCKS_CTR, BLOCKS_CTR - jz .L_loop - - mov TB, B - - /* Process second block */ - /* - * rounds - * 0+80, 2+80, 4+80, 6+80, 8+80 - * 10+80,12+80,14+80,16+80,18+80 - */ - - .set j, 0 - .rept 10 - RR j+80 - .set j, j+2 - .endr - - /* - * rounds - * 20+80,22+80,24+80,26+80,28+80 - * 30+80,32+80,34+80,36+80,38+80 - */ - .rept 10 - RR j+80 - .set j, j+2 - .endr - - /* - * rounds - * 40+80,42+80,44+80,46+80,48+80 - * 50+80,52+80,54+80,56+80,58+80 - */ - .rept 10 - RR j+80 - .set j, j+2 - .endr - - /* update counter */ - sub $1, BLOCKS_CTR - /* Move to the next block only if needed*/ - ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 - - /* - * rounds - * 60+80,62+80,64+80,66+80,68+80 - * 70+80,72+80,74+80,76+80,78+80 - */ - .rept 10 - RR j+80 - .set j, j+2 - .endr - - UPDATE_HASH (HASH_PTR), A - UPDATE_HASH 4(HASH_PTR), TB - UPDATE_HASH 8(HASH_PTR), C - UPDATE_HASH 12(HASH_PTR), D - UPDATE_HASH 16(HASH_PTR), E - - /* Reset state for AVX2 reg permutation */ - mov A, TA - mov TB, A - mov C, TB - mov E, C - mov D, B - mov TA, D - - REGALLOC - - xchg WK_BUF, PRECALC_BUF - - jmp .L_loop - - .align 32 -.L_end: - -.endm -/* - * macro implements SHA-1 function's body for several 64-byte blocks - * param: function's name - */ -.macro SHA1_VECTOR_ASM name - SYM_FUNC_START(\name) - - push %rbx - push %r12 - push %r13 - push 
%r14 - push %r15 - - RESERVE_STACK = (W_SIZE*4 + 8+24) - - /* Align stack */ - push %rbp - mov %rsp, %rbp - and $~(0x20-1), %rsp - sub $RESERVE_STACK, %rsp - - avx2_zeroupper - - /* Setup initial values */ - mov CTX, HASH_PTR - mov BUF, BUFFER_PTR - - mov BUF, BUFFER_PTR2 - mov CNT, BLOCKS_CTR - - xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP - - SHA1_PIPELINED_MAIN_BODY - - avx2_zeroupper - - mov %rbp, %rsp - pop %rbp - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - - RET - - SYM_FUNC_END(\name) -.endm - -.section .rodata - -#define K1 0x5a827999 -#define K2 0x6ed9eba1 -#define K3 0x8f1bbcdc -#define K4 0xca62c1d6 - -.align 128 -K_XMM_AR: - .long K1, K1, K1, K1 - .long K1, K1, K1, K1 - .long K2, K2, K2, K2 - .long K2, K2, K2, K2 - .long K3, K3, K3, K3 - .long K3, K3, K3, K3 - .long K4, K4, K4, K4 - .long K4, K4, K4, K4 - -BSWAP_SHUFB_CTL: - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f -.text - -SHA1_VECTOR_ASM sha1_transform_avx2 diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S deleted file mode 100644 index cade913d4882..000000000000 --- a/arch/x86/crypto/sha1_ni_asm.S +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Intel SHA Extensions optimized implementation of a SHA-1 update function - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Sean Gulley - * Tim Chen - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include -#include - -#define DIGEST_PTR %rdi /* 1st arg */ -#define DATA_PTR %rsi /* 2nd arg */ -#define NUM_BLKS %rdx /* 3rd arg */ - -/* gcc conversion */ -#define FRAME_SIZE 32 /* space for 2x16 bytes */ - -#define ABCD %xmm0 -#define E0 %xmm1 /* Need two E's b/c they ping pong */ -#define E1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 -#define SHUF_MASK %xmm7 - - -/* - * Intel SHA Extensions optimized implementation of a SHA-1 update function - * - * The function takes a pointer to the current hash values, a pointer to the - * input data, and a number of 64 byte blocks to process. Once all blocks have - * been processed, the digest pointer is updated with the resulting hash value. - * The function only processes complete blocks, there is no functionality to - * store partial blocks. All message padding and hash value initialization must - * be done outside the update function. - * - * The indented lines in the loop are instructions related to rounds processing. - * The non-indented lines are instructions related to the message schedule. - * - * void sha1_ni_transform(uint32_t *digest, const void *data, - uint32_t numBlocks) - * digest : pointer to digest - * data: pointer to input data - * numBlocks: Number of blocks to process - */ -.text -SYM_TYPED_FUNC_START(sha1_ni_transform) - push %rbp - mov %rsp, %rbp - sub $FRAME_SIZE, %rsp - and $~0xF, %rsp - - shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash - add DATA_PTR, NUM_BLKS /* pointer to end of data */ - - /* load initial hash values */ - pinsrd $3, 1*16(DIGEST_PTR), E0 - movdqu 0*16(DIGEST_PTR), ABCD - pand UPPER_WORD_MASK(%rip), E0 - pshufd $0x1B, ABCD, ABCD - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - -.Lloop0: - /* Save hash values for addition after rounds */ - movdqa E0, (0*16)(%rsp) - movdqa ABCD, (1*16)(%rsp) - - /* Rounds 0-3 */ - movdqu 0*16(DATA_PTR), MSG0 - pshufb SHUF_MASK, MSG0 - paddd MSG0, E0 - movdqa ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - movdqu 1*16(DATA_PTR), MSG1 - pshufb SHUF_MASK, MSG1 - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - movdqu 2*16(DATA_PTR), MSG2 - pshufb SHUF_MASK, MSG2 - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 12-15 */ - movdqu 3*16(DATA_PTR), MSG3 - pshufb SHUF_MASK, MSG3 - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* 
Rounds 28-31 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 44-47 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - pxor MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1rnds4 $3, E1, ABCD - - /* Add current hash values with previously saved */ - sha1nexte (0*16)(%rsp), E0 - paddd (1*16)(%rsp), ABCD - - /* Increment data pointer and loop if more to process */ - add $64, DATA_PTR - cmp NUM_BLKS, DATA_PTR - jne .Lloop0 - - /* Write hash values back in the correct order */ - pshufd $0x1B, ABCD, ABCD - movdqu ABCD, 0*16(DIGEST_PTR) - pextrd $3, E0, 1*16(DIGEST_PTR) - -.Ldone_hash: - mov %rbp, %rsp - pop %rbp - - RET -SYM_FUNC_END(sha1_ni_transform) - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f - -.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16 -.align 16 -UPPER_WORD_MASK: - .octa 0xFFFFFFFF000000000000000000000000 diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S deleted file mode 100644 index f54988c80eb4..000000000000 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ /dev/null @@ -1,554 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental - * SSE3 instruction set extensions introduced in Intel Core Microarchitecture - * processors. CPUs supporting Intel(R) AVX extensions will get an additional - * boost. - * - * This work was inspired by the vectorized implementation of Dean Gaudet. - * Additional information on it can be found at: - * http://www.arctic.org/~dean/crypto/sha1.html - * - * It was improved upon with more efficient vectorization of the message - * scheduling. This implementation has also been optimized for all current and - * several future generations of Intel CPUs. 
- * - * See this article for more information about the implementation details: - * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ - * - * Copyright (C) 2010, Intel Corp. - * Authors: Maxim Locktyukhin - * Ronen Zohar - * - * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: - * Author: Mathias Krause - */ - -#include -#include - -#define CTX %rdi // arg1 -#define BUF %rsi // arg2 -#define CNT %rdx // arg3 - -#define REG_A %ecx -#define REG_B %esi -#define REG_C %edi -#define REG_D %r12d -#define REG_E %edx - -#define REG_T1 %eax -#define REG_T2 %ebx - -#define K_BASE %r8 -#define HASH_PTR %r9 -#define BUFFER_PTR %r10 -#define BUFFER_END %r11 - -#define W_TMP1 %xmm0 -#define W_TMP2 %xmm9 - -#define W0 %xmm1 -#define W4 %xmm2 -#define W8 %xmm3 -#define W12 %xmm4 -#define W16 %xmm5 -#define W20 %xmm6 -#define W24 %xmm7 -#define W28 %xmm8 - -#define XMM_SHUFB_BSWAP %xmm10 - -/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ -#define WK(t) (((t) & 15) * 4)(%rsp) -#define W_PRECALC_AHEAD 16 - -/* - * This macro implements the SHA-1 function's body for single 64-byte block - * param: function's name - */ -.macro SHA1_VECTOR_ASM name - SYM_TYPED_FUNC_START(\name) - - push %rbx - push %r12 - push %rbp - mov %rsp, %rbp - - sub $64, %rsp # allocate workspace - and $~15, %rsp # align stack - - mov CTX, HASH_PTR - mov BUF, BUFFER_PTR - - shl $6, CNT # multiply by 64 - add BUF, CNT - mov CNT, BUFFER_END - - lea K_XMM_AR(%rip), K_BASE - xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP - - SHA1_PIPELINED_MAIN_BODY - - # cleanup workspace - mov $8, %ecx - mov %rsp, %rdi - xor %eax, %eax - rep stosq - - mov %rbp, %rsp # deallocate workspace - pop %rbp - pop %r12 - pop %rbx - RET - - SYM_FUNC_END(\name) -.endm - -/* - * This macro implements 80 rounds of SHA-1 for one 64-byte block - */ -.macro SHA1_PIPELINED_MAIN_BODY - INIT_REGALLOC - - mov (HASH_PTR), A - mov 4(HASH_PTR), B - mov 8(HASH_PTR), C - mov 12(HASH_PTR), D - mov 16(HASH_PTR), E - - .set i, 0 - .rept W_PRECALC_AHEAD - W_PRECALC i - .set i, (i+1) - .endr - -.align 4 -1: - RR F1,A,B,C,D,E,0 - RR F1,D,E,A,B,C,2 - RR F1,B,C,D,E,A,4 - RR F1,E,A,B,C,D,6 - RR F1,C,D,E,A,B,8 - - RR F1,A,B,C,D,E,10 - RR F1,D,E,A,B,C,12 - RR F1,B,C,D,E,A,14 - RR F1,E,A,B,C,D,16 - RR F1,C,D,E,A,B,18 - - RR F2,A,B,C,D,E,20 - RR F2,D,E,A,B,C,22 - RR F2,B,C,D,E,A,24 - RR F2,E,A,B,C,D,26 - RR F2,C,D,E,A,B,28 - - RR F2,A,B,C,D,E,30 - RR F2,D,E,A,B,C,32 - RR F2,B,C,D,E,A,34 - RR F2,E,A,B,C,D,36 - RR F2,C,D,E,A,B,38 - - RR F3,A,B,C,D,E,40 - RR F3,D,E,A,B,C,42 - RR F3,B,C,D,E,A,44 - RR F3,E,A,B,C,D,46 - RR F3,C,D,E,A,B,48 - - RR F3,A,B,C,D,E,50 - RR F3,D,E,A,B,C,52 - RR F3,B,C,D,E,A,54 - RR F3,E,A,B,C,D,56 - RR F3,C,D,E,A,B,58 - - add $64, BUFFER_PTR # move to the next 64-byte block - cmp BUFFER_END, BUFFER_PTR # if the current is the last one use - cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun - - RR F4,A,B,C,D,E,60 - RR F4,D,E,A,B,C,62 - RR F4,B,C,D,E,A,64 - RR F4,E,A,B,C,D,66 - RR F4,C,D,E,A,B,68 - - RR F4,A,B,C,D,E,70 - RR F4,D,E,A,B,C,72 - RR F4,B,C,D,E,A,74 - RR F4,E,A,B,C,D,76 - RR F4,C,D,E,A,B,78 - - UPDATE_HASH (HASH_PTR), A - UPDATE_HASH 4(HASH_PTR), B - UPDATE_HASH 8(HASH_PTR), C - UPDATE_HASH 12(HASH_PTR), D - UPDATE_HASH 16(HASH_PTR), E - - RESTORE_RENAMED_REGS - cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end - jne 1b -.endm - -.macro INIT_REGALLOC - .set A, REG_A - .set B, REG_B - .set C, REG_C - .set D, REG_D - .set E, REG_E - .set T1, 
REG_T1 - .set T2, REG_T2 -.endm - -.macro RESTORE_RENAMED_REGS - # order is important (REG_C is where it should be) - mov B, REG_B - mov D, REG_D - mov A, REG_A - mov E, REG_E -.endm - -.macro SWAP_REG_NAMES a, b - .set _T, \a - .set \a, \b - .set \b, _T -.endm - -.macro F1 b, c, d - mov \c, T1 - SWAP_REG_NAMES \c, T1 - xor \d, T1 - and \b, T1 - xor \d, T1 -.endm - -.macro F2 b, c, d - mov \d, T1 - SWAP_REG_NAMES \d, T1 - xor \c, T1 - xor \b, T1 -.endm - -.macro F3 b, c ,d - mov \c, T1 - SWAP_REG_NAMES \c, T1 - mov \b, T2 - or \b, T1 - and \c, T2 - and \d, T1 - or T2, T1 -.endm - -.macro F4 b, c, d - F2 \b, \c, \d -.endm - -.macro UPDATE_HASH hash, val - add \hash, \val - mov \val, \hash -.endm - -/* - * RR does two rounds of SHA-1 back to back with W[] pre-calc - * t1 = F(b, c, d); e += w(i) - * e += t1; b <<= 30; d += w(i+1); - * t1 = F(a, b, c); - * d += t1; a <<= 5; - * e += a; - * t1 = e; a >>= 7; - * t1 <<= 5; - * d += t1; - */ -.macro RR F, a, b, c, d, e, round - add WK(\round), \e - \F \b, \c, \d # t1 = F(b, c, d); - W_PRECALC (\round + W_PRECALC_AHEAD) - rol $30, \b - add T1, \e - add WK(\round + 1), \d - - \F \a, \b, \c - W_PRECALC (\round + W_PRECALC_AHEAD + 1) - rol $5, \a - add \a, \e - add T1, \d - ror $7, \a # (a <>r 7) => a <= 80) && (i < (80 + W_PRECALC_AHEAD)))) - .set i, ((\r) % 80) # pre-compute for the next iteration - .if (i == 0) - W_PRECALC_RESET - .endif - W_PRECALC_00_15 - .elseif (i<32) - W_PRECALC_16_31 - .elseif (i < 80) // rounds 32-79 - W_PRECALC_32_79 - .endif -.endm - -.macro W_PRECALC_RESET - .set W, W0 - .set W_minus_04, W4 - .set W_minus_08, W8 - .set W_minus_12, W12 - .set W_minus_16, W16 - .set W_minus_20, W20 - .set W_minus_24, W24 - .set W_minus_28, W28 - .set W_minus_32, W -.endm - -.macro W_PRECALC_ROTATE - .set W_minus_32, W_minus_28 - .set W_minus_28, W_minus_24 - .set W_minus_24, W_minus_20 - .set W_minus_20, W_minus_16 - .set W_minus_16, W_minus_12 - .set W_minus_12, W_minus_08 - .set W_minus_08, W_minus_04 - .set W_minus_04, W - .set W, W_minus_32 -.endm - -.macro W_PRECALC_SSSE3 - -.macro W_PRECALC_00_15 - W_PRECALC_00_15_SSSE3 -.endm -.macro W_PRECALC_16_31 - W_PRECALC_16_31_SSSE3 -.endm -.macro W_PRECALC_32_79 - W_PRECALC_32_79_SSSE3 -.endm - -/* message scheduling pre-compute for rounds 0-15 */ -.macro W_PRECALC_00_15_SSSE3 - .if ((i & 3) == 0) - movdqu (i*4)(BUFFER_PTR), W_TMP1 - .elseif ((i & 3) == 1) - pshufb XMM_SHUFB_BSWAP, W_TMP1 - movdqa W_TMP1, W - .elseif ((i & 3) == 2) - paddd (K_BASE), W_TMP1 - .elseif ((i & 3) == 3) - movdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -/* message scheduling pre-compute for rounds 16-31 - * - * - calculating last 32 w[i] values in 8 XMM registers - * - pre-calculate K+w[i] values and store to mem, for later load by ALU add - * instruction - * - * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] - * dependency, but improves for 32-79 - */ -.macro W_PRECALC_16_31_SSSE3 - # blended scheduling of vector and scalar instruction streams, one 4-wide - # vector iteration / 4 scalar rounds - .if ((i & 3) == 0) - movdqa W_minus_12, W - palignr $8, W_minus_16, W # w[i-14] - movdqa W_minus_04, W_TMP1 - psrldq $4, W_TMP1 # w[i-3] - pxor W_minus_08, W - .elseif ((i & 3) == 1) - pxor W_minus_16, W_TMP1 - pxor W_TMP1, W - movdqa W, W_TMP2 - movdqa W, W_TMP1 - pslldq $12, W_TMP2 - .elseif ((i & 3) == 2) - psrld $31, W - pslld $1, W_TMP1 - por W, W_TMP1 - movdqa W_TMP2, W - psrld $30, W_TMP2 - pslld $2, W - .elseif ((i & 3) == 3) - pxor W, W_TMP1 - pxor W_TMP2, W_TMP1 - movdqa 
W_TMP1, W - paddd K_XMM(K_BASE), W_TMP1 - movdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -/* message scheduling pre-compute for rounds 32-79 - * - * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 - * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 - * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken - */ -.macro W_PRECALC_32_79_SSSE3 - .if ((i & 3) == 0) - movdqa W_minus_04, W_TMP1 - pxor W_minus_28, W # W is W_minus_32 before xor - palignr $8, W_minus_08, W_TMP1 - .elseif ((i & 3) == 1) - pxor W_minus_16, W - pxor W_TMP1, W - movdqa W, W_TMP1 - .elseif ((i & 3) == 2) - psrld $30, W - pslld $2, W_TMP1 - por W, W_TMP1 - .elseif ((i & 3) == 3) - movdqa W_TMP1, W - paddd K_XMM(K_BASE), W_TMP1 - movdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.endm // W_PRECALC_SSSE3 - - -#define K1 0x5a827999 -#define K2 0x6ed9eba1 -#define K3 0x8f1bbcdc -#define K4 0xca62c1d6 - -.section .rodata -.align 16 - -K_XMM_AR: - .long K1, K1, K1, K1 - .long K2, K2, K2, K2 - .long K3, K3, K3, K3 - .long K4, K4, K4, K4 - -BSWAP_SHUFB_CTL: - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f - - -.section .text - -W_PRECALC_SSSE3 -.macro xmm_mov a, b - movdqu \a,\b -.endm - -/* - * SSSE3 optimized implementation: - * - * extern "C" void sha1_transform_ssse3(struct sha1_state *state, - * const u8 *data, int blocks); - * - * Note that struct sha1_state is assumed to begin with u32 state[5]. - */ -SHA1_VECTOR_ASM sha1_transform_ssse3 - -.macro W_PRECALC_AVX - -.purgem W_PRECALC_00_15 -.macro W_PRECALC_00_15 - W_PRECALC_00_15_AVX -.endm -.purgem W_PRECALC_16_31 -.macro W_PRECALC_16_31 - W_PRECALC_16_31_AVX -.endm -.purgem W_PRECALC_32_79 -.macro W_PRECALC_32_79 - W_PRECALC_32_79_AVX -.endm - -.macro W_PRECALC_00_15_AVX - .if ((i & 3) == 0) - vmovdqu (i*4)(BUFFER_PTR), W_TMP1 - .elseif ((i & 3) == 1) - vpshufb XMM_SHUFB_BSWAP, W_TMP1, W - .elseif ((i & 3) == 2) - vpaddd (K_BASE), W, W_TMP1 - .elseif ((i & 3) == 3) - vmovdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.macro W_PRECALC_16_31_AVX - .if ((i & 3) == 0) - vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] - vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] - vpxor W_minus_08, W, W - vpxor W_minus_16, W_TMP1, W_TMP1 - .elseif ((i & 3) == 1) - vpxor W_TMP1, W, W - vpslldq $12, W, W_TMP2 - vpslld $1, W, W_TMP1 - .elseif ((i & 3) == 2) - vpsrld $31, W, W - vpor W, W_TMP1, W_TMP1 - vpslld $2, W_TMP2, W - vpsrld $30, W_TMP2, W_TMP2 - .elseif ((i & 3) == 3) - vpxor W, W_TMP1, W_TMP1 - vpxor W_TMP2, W_TMP1, W - vpaddd K_XMM(K_BASE), W, W_TMP1 - vmovdqu W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.macro W_PRECALC_32_79_AVX - .if ((i & 3) == 0) - vpalignr $8, W_minus_08, W_minus_04, W_TMP1 - vpxor W_minus_28, W, W # W is W_minus_32 before xor - .elseif ((i & 3) == 1) - vpxor W_minus_16, W_TMP1, W_TMP1 - vpxor W_TMP1, W, W - .elseif ((i & 3) == 2) - vpslld $2, W, W_TMP1 - vpsrld $30, W, W - vpor W, W_TMP1, W - .elseif ((i & 3) == 3) - vpaddd K_XMM(K_BASE), W, W_TMP1 - vmovdqu W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.endm // W_PRECALC_AVX - -W_PRECALC_AVX -.purgem xmm_mov -.macro xmm_mov a, b - vmovdqu \a,\b -.endm - - -/* AVX optimized implementation: - * extern "C" void sha1_transform_avx(struct sha1_state *state, - * const u8 *data, int blocks); - */ -SHA1_VECTOR_ASM sha1_transform_avx diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c deleted file mode 100644 index 
826579a7473c..000000000000 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ /dev/null @@ -1,324 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * - * Glue code for the SHA1 Secure Hash Algorithm assembler implementations - * using SSSE3, AVX, AVX2, and SHA-NI instructions. - * - * This file is based on sha1_generic.c - * - * Copyright (c) Alan Smithee. - * Copyright (c) Andrew McDonald - * Copyright (c) Jean-Francois Dive - * Copyright (c) Mathias Krause - * Copyright (c) Chandramouli Narayanan - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include - -static const struct x86_cpu_id module_cpu_ids[] = { - X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), - X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); - -static inline int sha1_update_x86(struct shash_desc *desc, const u8 *data, - unsigned int len, sha1_block_fn *sha1_xform) -{ - int remain; - - /* - * Make sure struct sha1_state begins directly with the SHA1 - * 160-bit internal state, as this is what the asm functions expect. - */ - BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); - - kernel_fpu_begin(); - remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform); - kernel_fpu_end(); - - return remain; -} - -static inline int sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, - sha1_block_fn *sha1_xform) -{ - kernel_fpu_begin(); - sha1_base_do_finup(desc, data, len, sha1_xform); - kernel_fpu_end(); - - return sha1_base_finish(desc, out); -} - -asmlinkage void sha1_transform_ssse3(struct sha1_state *state, - const u8 *data, int blocks); - -static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update_x86(desc, data, len, sha1_transform_ssse3); -} - -static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_transform_ssse3); -} - -static struct shash_alg sha1_ssse3_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_ssse3_update, - .finup = sha1_ssse3_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ssse3", - .cra_priority = 150, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int register_sha1_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - return crypto_register_shash(&sha1_ssse3_alg); - return 0; -} - -static void unregister_sha1_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shash(&sha1_ssse3_alg); -} - -asmlinkage void sha1_transform_avx(struct sha1_state *state, - const u8 *data, int blocks); - -static int sha1_avx_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update_x86(desc, data, len, sha1_transform_avx); -} - -static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_transform_avx); -} - -static struct shash_alg sha1_avx_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_avx_update, - .finup = sha1_avx_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-avx", - .cra_priority = 160, - .cra_flags = 
CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static bool avx_usable(void) -{ - if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (boot_cpu_has(X86_FEATURE_AVX)) - pr_info("AVX detected but unusable.\n"); - return false; - } - - return true; -} - -static int register_sha1_avx(void) -{ - if (avx_usable()) - return crypto_register_shash(&sha1_avx_alg); - return 0; -} - -static void unregister_sha1_avx(void) -{ - if (avx_usable()) - crypto_unregister_shash(&sha1_avx_alg); -} - -#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ - -asmlinkage void sha1_transform_avx2(struct sha1_state *state, - const u8 *data, int blocks); - -static bool avx2_usable(void) -{ - if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) - && boot_cpu_has(X86_FEATURE_BMI1) - && boot_cpu_has(X86_FEATURE_BMI2)) - return true; - - return false; -} - -static inline void sha1_apply_transform_avx2(struct sha1_state *state, - const u8 *data, int blocks) -{ - /* Select the optimal transform based on data block size */ - if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE) - sha1_transform_avx2(state, data, blocks); - else - sha1_transform_avx(state, data, blocks); -} - -static int sha1_avx2_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update_x86(desc, data, len, sha1_apply_transform_avx2); -} - -static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2); -} - -static struct shash_alg sha1_avx2_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_avx2_update, - .finup = sha1_avx2_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-avx2", - .cra_priority = 170, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int register_sha1_avx2(void) -{ - if (avx2_usable()) - return crypto_register_shash(&sha1_avx2_alg); - return 0; -} - -static void unregister_sha1_avx2(void) -{ - if (avx2_usable()) - crypto_unregister_shash(&sha1_avx2_alg); -} - -asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data, - int rounds); - -static int sha1_ni_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update_x86(desc, data, len, sha1_ni_transform); -} - -static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_ni_transform); -} - -static struct shash_alg sha1_ni_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_ni_update, - .finup = sha1_ni_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ni", - .cra_priority = 250, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int register_sha1_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - return crypto_register_shash(&sha1_ni_alg); - return 0; -} - -static void unregister_sha1_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - crypto_unregister_shash(&sha1_ni_alg); -} - -static int __init sha1_ssse3_mod_init(void) -{ - if (!x86_match_cpu(module_cpu_ids)) - return -ENODEV; - - if (register_sha1_ssse3()) - goto fail; - - if (register_sha1_avx()) { - unregister_sha1_ssse3(); - goto fail; - } - - if 
(register_sha1_avx2()) { - unregister_sha1_avx(); - unregister_sha1_ssse3(); - goto fail; - } - - if (register_sha1_ni()) { - unregister_sha1_avx2(); - unregister_sha1_avx(); - unregister_sha1_ssse3(); - goto fail; - } - - return 0; -fail: - return -ENODEV; -} - -static void __exit sha1_ssse3_mod_fini(void) -{ - unregister_sha1_ni(); - unregister_sha1_avx2(); - unregister_sha1_avx(); - unregister_sha1_ssse3(); -} - -module_init(sha1_ssse3_mod_init); -module_exit(sha1_ssse3_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated"); - -MODULE_ALIAS_CRYPTO("sha1"); -MODULE_ALIAS_CRYPTO("sha1-ssse3"); -MODULE_ALIAS_CRYPTO("sha1-avx"); -MODULE_ALIAS_CRYPTO("sha1-avx2"); -MODULE_ALIAS_CRYPTO("sha1-ni"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 019034f2bb53..dc34441a80bd 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -152,6 +152,7 @@ config CRYPTO_LIB_SHA1_ARCH default y if PPC default y if S390 default y if SPARC64 + default y if X86_64 config CRYPTO_LIB_SHA256 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 7897da2de623..b09fdb7f4b4c 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -82,6 +82,9 @@ libsha1-y += powerpc/sha1-powerpc-asm.o libsha1-$(CONFIG_SPE) += powerpc/sha1-spe-asm.o endif libsha1-$(CONFIG_SPARC) += sparc/sha1_asm.o +libsha1-$(CONFIG_X86) += x86/sha1-ssse3-and-avx.o \ + x86/sha1-avx2-asm.o \ + x86/sha1-ni-asm.o endif # CONFIG_CRYPTO_LIB_SHA1_ARCH ################################################################################ diff --git a/lib/crypto/x86/sha1-avx2-asm.S b/lib/crypto/x86/sha1-avx2-asm.S new file mode 100644 index 000000000000..91b2dc6db179 --- /dev/null +++ b/lib/crypto/x86/sha1-avx2-asm.S @@ -0,0 +1,697 @@ +/* + * Implement fast SHA-1 with AVX2 instructions. (x86_64) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Ilya Albrekht + * Maxim Locktyukhin + * Ronen Zohar + * Chandramouli Narayanan + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. + * + *This implementation is based on the previous SSSE3 release: + *Visit http://software.intel.com/en-us/articles/ + *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ + * + * void sha1_transform_avx2(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ + +#include + +#define CTX %rdi /* arg1 */ +#define BUF %rsi /* arg2 */ +#define CNT %rdx /* arg3 */ + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %eax +#define REG_E %edx +#define REG_TB %ebx +#define REG_TA %r12d +#define REG_RA %rcx +#define REG_RB %rsi +#define REG_RC %rdi +#define REG_RD %rax +#define REG_RE %rdx +#define REG_RTA %r12 +#define REG_RTB %rbx +#define REG_T1 %r11d +#define xmm_mov vmovups +#define avx2_zeroupper vzeroupper +#define RND_F1 1 +#define RND_F2 2 +#define RND_F3 3 + +.macro REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set TB, REG_TB + .set TA, REG_TA + + .set RA, REG_RA + .set RB, REG_RB + .set RC, REG_RC + .set RD, REG_RD + .set RE, REG_RE + + .set RTA, REG_RTA + .set RTB, REG_RTB + + .set T1, REG_T1 +.endm + +#define HASH_PTR %r9 +#define BLOCKS_CTR %r8 +#define BUFFER_PTR %r10 +#define BUFFER_PTR2 %r13 + +#define PRECALC_BUF %r14 +#define WK_BUF %r15 + +#define W_TMP %xmm0 +#define WY_TMP %ymm0 +#define WY_TMP2 %ymm9 + +# AVX2 variables +#define WY0 %ymm3 +#define WY4 %ymm5 +#define WY08 %ymm7 +#define WY12 %ymm8 +#define WY16 %ymm12 +#define WY20 %ymm13 +#define WY24 %ymm14 +#define WY28 %ymm15 + +#define YMM_SHUFB_BSWAP %ymm10 + +/* + * Keep 2 iterations precalculated at a time: + * - 80 DWORDs per iteration * 2 + */ +#define W_SIZE (80*2*2 +16) + +#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) +#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) + + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +.macro PRECALC_RESET_WY + .set WY_00, WY0 + .set WY_04, WY4 + .set WY_08, WY08 + .set WY_12, WY12 + .set WY_16, WY16 + .set WY_20, WY20 + .set WY_24, WY24 + .set WY_28, WY28 + .set WY_32, WY_00 +.endm + +.macro PRECALC_ROTATE_WY + /* Rotate macros */ + .set WY_32, WY_28 + .set WY_28, WY_24 + .set WY_24, WY_20 + .set WY_20, WY_16 + .set WY_16, WY_12 + .set WY_12, WY_08 + .set WY_08, WY_04 + .set WY_04, WY_00 + .set WY_00, WY_32 + + /* Define register aliases */ + .set WY, WY_00 + .set WY_minus_04, WY_04 + .set WY_minus_08, WY_08 + .set WY_minus_12, WY_12 + .set WY_minus_16, WY_16 + .set WY_minus_20, WY_20 + .set WY_minus_24, WY_24 + .set WY_minus_28, WY_28 + .set WY_minus_32, WY +.endm + +.macro PRECALC_00_15 + .if (i == 0) # Initialize and rotate registers + 
PRECALC_RESET_WY + PRECALC_ROTATE_WY + .endif + + /* message scheduling pre-compute for rounds 0-15 */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vmovdqu (i * 2)(BUFFER_PTR), W_TMP + .elseif ((i & 7) == 1) + vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ + WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY + .elseif ((i & 7) == 4) + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP + .elseif ((i & 7) == 7) + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_16_31 + /* + * message scheduling pre-compute for rounds 16-31 + * calculating last 32 w[i] values in 8 XMM registers + * pre-calculate K+w[i] values and store to mem + * for later load by ALU add instruction + * + * "brute force" vectorization for rounds 16-31 only + * due to w[i]->w[i-3] dependency + */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + /* w[i-14] */ + vpalignr $8, WY_minus_16, WY_minus_12, WY + vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ + .elseif ((i & 7) == 1) + vpxor WY_minus_08, WY, WY + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpxor WY_TMP, WY, WY + vpslldq $12, WY, WY_TMP2 + .elseif ((i & 7) == 3) + vpslld $1, WY, WY_TMP + vpsrld $31, WY, WY + .elseif ((i & 7) == 4) + vpor WY, WY_TMP, WY_TMP + vpslld $2, WY_TMP2, WY + .elseif ((i & 7) == 5) + vpsrld $30, WY_TMP2, WY_TMP2 + vpxor WY, WY_TMP, WY_TMP + .elseif ((i & 7) == 7) + vpxor WY_TMP2, WY_TMP, WY + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_32_79 + /* + * in SHA-1 specification: + * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: + * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization + * since w[i]=>w[i-3] dependency is broken + */ + + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP + .elseif ((i & 7) == 1) + /* W is W_minus_32 before xor */ + vpxor WY_minus_28, WY, WY + .elseif ((i & 7) == 2) + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 3) + vpxor WY_TMP, WY, WY + .elseif ((i & 7) == 4) + vpslld $2, WY, WY_TMP + .elseif ((i & 7) == 5) + vpsrld $30, WY, WY + vpor WY, WY_TMP, WY + .elseif ((i & 7) == 7) + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC r, s + .set i, \r + + .if (i < 40) + .set K_XMM, 32*0 + .elseif (i < 80) + .set K_XMM, 32*1 + .elseif (i < 120) + .set K_XMM, 32*2 + .else + .set K_XMM, 32*3 + .endif + + .if (i<32) + PRECALC_00_15 \s + .elseif (i<64) + PRECALC_16_31 \s + .elseif (i < 160) + PRECALC_32_79 \s + .endif +.endm + +.macro ROTATE_STATE + .set T_REG, E + .set E, D + .set D, C + .set C, B + .set B, TB + .set TB, A + .set A, T_REG + + .set T_REG, RE + .set RE, RD + .set RD, RC + .set RC, RB + .set RB, RTB + .set RTB, RA + .set RA, T_REG +.endm + +/* Macro relies on saved ROUND_Fx */ + +.macro RND_FUN f, r + .if (\f == RND_F1) + ROUND_F1 \r + .elseif (\f == RND_F2) + ROUND_F2 \r + .elseif (\f == RND_F3) + ROUND_F3 \r + .endif +.endm + +.macro RR r + .set round_id, (\r % 80) + + .if (round_id == 0) /* Precalculate F for first round */ + .set ROUND_FUNC, RND_F1 + mov B, TB + + rorx $(32-30), B, B /* b>>>2 */ + andn D, TB, T1 + and C, TB + xor T1, TB + .endif + + 
RND_FUN ROUND_FUNC, \r + ROTATE_STATE + + .if (round_id == 18) + .set ROUND_FUNC, RND_F2 + .elseif (round_id == 38) + .set ROUND_FUNC, RND_F3 + .elseif (round_id == 58) + .set ROUND_FUNC, RND_F2 + .endif + + .set round_id, ( (\r+1) % 80) + + RND_FUN ROUND_FUNC, (\r+1) + ROTATE_STATE +.endm + +.macro ROUND_F1 r + add WK(\r), E + + andn C, A, T1 /* ~b&d */ + lea (RE,RTB), E /* Add F from the previous round */ + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30),A, TB /* b>>>2 for next round */ + + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + /* + * Calculate F for the next round + * (b & c) ^ andn[b, d] + */ + and B, A /* b&c */ + xor T1, A /* F1 = (b&c) ^ (~b&d) */ + + lea (RE,RTA), E /* E += A >>> 5 */ +.endm + +.macro ROUND_F2 r + add WK(\r), E + lea (RE,RTB), E /* Add F from the previous round */ + + /* Calculate F for the next round */ + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + .if ((round_id) < 79) + rorx $(32-30), A, TB /* b>>>2 for next round */ + .endif + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + .if ((round_id) < 79) + xor B, A + .endif + + add TA, E /* E += A >>> 5 */ + + .if ((round_id) < 79) + xor C, A + .endif +.endm + +.macro ROUND_F3 r + add WK(\r), E + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + lea (RE,RTB), E /* Add F from the previous round */ + + mov B, T1 + or A, T1 + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30), A, TB /* b>>>2 for next round */ + + /* Calculate F for the next round + * (b and c) or (d and (b or c)) + */ + and C, T1 + and B, A + or T1, A + + add TA, E /* E += A >>> 5 */ + +.endm + +/* Add constant only if (%2 > %3) condition met (uses RTA as temp) + * %1 + %2 >= %3 ? %4 : 0 + */ +.macro ADD_IF_GE a, b, c, d + mov \a, RTA + add $\d, RTA + cmp $\c, \b + cmovge RTA, \a +.endm + +/* + * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining + */ +.macro SHA1_PIPELINED_MAIN_BODY + + REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + mov %rsp, PRECALC_BUF + lea (2*4*80+32)(%rsp), WK_BUF + + # Precalc WK for first 2 blocks + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 + .set i, 0 + .rept 160 + PRECALC i + .set i, i + 1 + .endr + + /* Go to next block if needed */ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 + xchg WK_BUF, PRECALC_BUF + + .align 32 +.L_loop: + /* + * code loops through more than one block + * we use K_BASE value as a signal of a last block, + * it is set below by: cmovae BUFFER_PTR, K_BASE + */ + test BLOCKS_CTR, BLOCKS_CTR + jnz .L_begin + .align 32 + jmp .L_end + .align 32 +.L_begin: + + /* + * Do first block + * rounds: 0,2,4,6,8 + */ + .set j, 0 + .rept 5 + RR j + .set j, j+2 + .endr + + /* + * rounds: + * 10,12,14,16,18 + * 20,22,24,26,28 + * 30,32,34,36,38 + * 40,42,44,46,48 + * 50,52,54,56,58 + */ + .rept 25 + RR j + .set j, j+2 + .endr + + /* Update Counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 + /* + * rounds + * 60,62,64,66,68 + * 70,72,74,76,78 + */ + .rept 10 + RR j + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + test BLOCKS_CTR, BLOCKS_CTR + jz .L_loop + + mov TB, B + + /* Process second block */ + /* + * rounds + * 0+80, 2+80, 4+80, 6+80, 8+80 + * 10+80,12+80,14+80,16+80,18+80 + */ + + .set j, 0 + .rept 10 + RR j+80 + .set j, j+2 + .endr + + /* + * 
rounds + * 20+80,22+80,24+80,26+80,28+80 + * 30+80,32+80,34+80,36+80,38+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + /* + * rounds + * 40+80,42+80,44+80,46+80,48+80 + * 50+80,52+80,54+80,56+80,58+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + /* update counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 + + /* + * rounds + * 60+80,62+80,64+80,66+80,68+80 + * 70+80,72+80,74+80,76+80,78+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + /* Reset state for AVX2 reg permutation */ + mov A, TA + mov TB, A + mov C, TB + mov E, C + mov D, B + mov TA, D + + REGALLOC + + xchg WK_BUF, PRECALC_BUF + + jmp .L_loop + + .align 32 +.L_end: + +.endm +/* + * macro implements SHA-1 function's body for several 64-byte blocks + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + SYM_FUNC_START(\name) + + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + RESERVE_STACK = (W_SIZE*4 + 8+24) + + /* Align stack */ + push %rbp + mov %rsp, %rbp + and $~(0x20-1), %rsp + sub $RESERVE_STACK, %rsp + + avx2_zeroupper + + /* Setup initial values */ + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + + mov BUF, BUFFER_PTR2 + mov CNT, BLOCKS_CTR + + xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + avx2_zeroupper + + mov %rbp, %rsp + pop %rbp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + RET + + SYM_FUNC_END(\name) +.endm + +.section .rodata + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.align 128 +K_XMM_AR: + .long K1, K1, K1, K1 + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f +.text + +SHA1_VECTOR_ASM sha1_transform_avx2 diff --git a/lib/crypto/x86/sha1-ni-asm.S b/lib/crypto/x86/sha1-ni-asm.S new file mode 100644 index 000000000000..3989b0642ff5 --- /dev/null +++ b/lib/crypto/x86/sha1-ni-asm.S @@ -0,0 +1,299 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-1 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley + * Tim Chen + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +#define DIGEST_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +/* gcc conversion */ +#define FRAME_SIZE 32 /* space for 2x16 bytes */ + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define SHUF_MASK %xmm7 + + +/* + * Intel SHA Extensions optimized implementation of a SHA-1 block function + * + * This function takes a pointer to the current SHA-1 state, a pointer to the + * input data, and the number of 64-byte blocks to process. Once all blocks + * have been processed, the state is updated with the new state. This function + * only processes complete blocks. State initialization, buffering of partial + * blocks, and digest finalization are expected to be handled elsewhere. + * + * The indented lines in the loop are instructions related to rounds processing. + * The non-indented lines are instructions related to the message schedule. 
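
Since the block function only consumes whole 64-byte blocks (its prototype follows just below), the caller is expected to buffer any tail bytes and to perform padding and finalization itself. As a hedged sketch of that caller-side contract, with struct sha1_model_ctx and sha1_blocks() invented purely for illustration (they are not APIs in this patch):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Invented context for illustration only. */
struct sha1_model_ctx {
	uint32_t state[5];	/* the current hash values */
	uint8_t buf[64];	/* partial-block buffer */
	size_t buflen;		/* bytes currently queued in buf */
};

/* Hypothetical block function with the same shape as sha1_ni_transform(). */
void sha1_blocks(uint32_t state[5], const uint8_t *data, size_t nblocks);

/* Feed arbitrary-length data, handing only complete 64-byte blocks to the
 * block function; padding and finalization are handled elsewhere. */
static void sha1_model_update(struct sha1_model_ctx *ctx,
			      const uint8_t *data, size_t len)
{
	if (ctx->buflen) {
		size_t take = 64 - ctx->buflen;

		if (take > len)
			take = len;
		memcpy(ctx->buf + ctx->buflen, data, take);
		ctx->buflen += take;
		data += take;
		len -= take;
		if (ctx->buflen == 64) {
			sha1_blocks(ctx->state, ctx->buf, 1);
			ctx->buflen = 0;
		}
	}
	if (len >= 64) {
		sha1_blocks(ctx->state, data, len / 64);
		data += len & ~(size_t)63;
		len &= 63;
	}
	memcpy(ctx->buf + ctx->buflen, data, len);	/* stash the tail */
	ctx->buflen += len;
}
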
+ * + * void sha1_ni_transform(struct sha1_block_state *state, + * const u8 *data, size_t nblocks) + */ +.text +SYM_FUNC_START(sha1_ni_transform) + push %rbp + mov %rsp, %rbp + sub $FRAME_SIZE, %rsp + and $~0xF, %rsp + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* load initial hash values */ + pinsrd $3, 1*16(DIGEST_PTR), E0 + movdqu 0*16(DIGEST_PTR), ABCD + pand UPPER_WORD_MASK(%rip), E0 + pshufd $0x1B, ABCD, ABCD + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa E0, (0*16)(%rsp) + movdqa ABCD, (1*16)(%rsp) + + /* Rounds 0-3 */ + movdqu 0*16(DATA_PTR), MSG0 + pshufb SHUF_MASK, MSG0 + paddd MSG0, E0 + movdqa ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + movdqu 1*16(DATA_PTR), MSG1 + pshufb SHUF_MASK, MSG1 + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + movdqu 2*16(DATA_PTR), MSG2 + pshufb SHUF_MASK, MSG2 + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 12-15 */ + movdqu 3*16(DATA_PTR), MSG3 + pshufb SHUF_MASK, MSG3 + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + pxor MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte 
MSG3, E1 + movdqa ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte (0*16)(%rsp), E0 + paddd (1*16)(%rsp), ABCD + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + pshufd $0x1B, ABCD, ABCD + movdqu ABCD, 0*16(DIGEST_PTR) + pextrd $3, E0, 1*16(DIGEST_PTR) + +.Ldone_hash: + mov %rbp, %rsp + pop %rbp + + RET +SYM_FUNC_END(sha1_ni_transform) + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f + +.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16 +.align 16 +UPPER_WORD_MASK: + .octa 0xFFFFFFFF000000000000000000000000 diff --git a/lib/crypto/x86/sha1-ssse3-and-avx.S b/lib/crypto/x86/sha1-ssse3-and-avx.S new file mode 100644 index 000000000000..78b48cb09c5b --- /dev/null +++ b/lib/crypto/x86/sha1-ssse3-and-avx.S @@ -0,0 +1,551 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental + * SSE3 instruction set extensions introduced in Intel Core Microarchitecture + * processors. CPUs supporting Intel(R) AVX extensions will get an additional + * boost. + * + * This work was inspired by the vectorized implementation of Dean Gaudet. + * Additional information on it can be found at: + * http://www.arctic.org/~dean/crypto/sha1.html + * + * It was improved upon with more efficient vectorization of the message + * scheduling. This implementation has also been optimized for all current and + * several future generations of Intel CPUs. + * + * See this article for more information about the implementation details: + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ + * + * Copyright (C) 2010, Intel Corp. 
+ * Authors: Maxim Locktyukhin + * Ronen Zohar + * + * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: + * Author: Mathias Krause + */ + +#include + +#define CTX %rdi // arg1 +#define BUF %rsi // arg2 +#define CNT %rdx // arg3 + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %r12d +#define REG_E %edx + +#define REG_T1 %eax +#define REG_T2 %ebx + +#define K_BASE %r8 +#define HASH_PTR %r9 +#define BUFFER_PTR %r10 +#define BUFFER_END %r11 + +#define W_TMP1 %xmm0 +#define W_TMP2 %xmm9 + +#define W0 %xmm1 +#define W4 %xmm2 +#define W8 %xmm3 +#define W12 %xmm4 +#define W16 %xmm5 +#define W20 %xmm6 +#define W24 %xmm7 +#define W28 %xmm8 + +#define XMM_SHUFB_BSWAP %xmm10 + +/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ +#define WK(t) (((t) & 15) * 4)(%rsp) +#define W_PRECALC_AHEAD 16 + +/* + * This macro implements the SHA-1 function's body for single 64-byte block + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + SYM_FUNC_START(\name) + + push %rbx + push %r12 + push %rbp + mov %rsp, %rbp + + sub $64, %rsp # allocate workspace + and $~15, %rsp # align stack + + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + + shl $6, CNT # multiply by 64 + add BUF, CNT + mov CNT, BUFFER_END + + lea K_XMM_AR(%rip), K_BASE + xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + # cleanup workspace + mov $8, %ecx + mov %rsp, %rdi + xor %eax, %eax + rep stosq + + mov %rbp, %rsp # deallocate workspace + pop %rbp + pop %r12 + pop %rbx + RET + + SYM_FUNC_END(\name) +.endm + +/* + * This macro implements 80 rounds of SHA-1 for one 64-byte block + */ +.macro SHA1_PIPELINED_MAIN_BODY + INIT_REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + .set i, 0 + .rept W_PRECALC_AHEAD + W_PRECALC i + .set i, (i+1) + .endr + +.align 4 +1: + RR F1,A,B,C,D,E,0 + RR F1,D,E,A,B,C,2 + RR F1,B,C,D,E,A,4 + RR F1,E,A,B,C,D,6 + RR F1,C,D,E,A,B,8 + + RR F1,A,B,C,D,E,10 + RR F1,D,E,A,B,C,12 + RR F1,B,C,D,E,A,14 + RR F1,E,A,B,C,D,16 + RR F1,C,D,E,A,B,18 + + RR F2,A,B,C,D,E,20 + RR F2,D,E,A,B,C,22 + RR F2,B,C,D,E,A,24 + RR F2,E,A,B,C,D,26 + RR F2,C,D,E,A,B,28 + + RR F2,A,B,C,D,E,30 + RR F2,D,E,A,B,C,32 + RR F2,B,C,D,E,A,34 + RR F2,E,A,B,C,D,36 + RR F2,C,D,E,A,B,38 + + RR F3,A,B,C,D,E,40 + RR F3,D,E,A,B,C,42 + RR F3,B,C,D,E,A,44 + RR F3,E,A,B,C,D,46 + RR F3,C,D,E,A,B,48 + + RR F3,A,B,C,D,E,50 + RR F3,D,E,A,B,C,52 + RR F3,B,C,D,E,A,54 + RR F3,E,A,B,C,D,56 + RR F3,C,D,E,A,B,58 + + add $64, BUFFER_PTR # move to the next 64-byte block + cmp BUFFER_END, BUFFER_PTR # if the current is the last one use + cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun + + RR F4,A,B,C,D,E,60 + RR F4,D,E,A,B,C,62 + RR F4,B,C,D,E,A,64 + RR F4,E,A,B,C,D,66 + RR F4,C,D,E,A,B,68 + + RR F4,A,B,C,D,E,70 + RR F4,D,E,A,B,C,72 + RR F4,B,C,D,E,A,74 + RR F4,E,A,B,C,D,76 + RR F4,C,D,E,A,B,78 + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), B + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + RESTORE_RENAMED_REGS + cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end + jne 1b +.endm + +.macro INIT_REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set T1, REG_T1 + .set T2, REG_T2 +.endm + +.macro RESTORE_RENAMED_REGS + # order is important (REG_C is where it should be) + mov B, REG_B + mov D, REG_D + mov A, REG_A + mov E, REG_E +.endm + +.macro SWAP_REG_NAMES a, b + .set _T, \a + .set \a, \b + 
.set \b, _T +.endm + +.macro F1 b, c, d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + xor \d, T1 + and \b, T1 + xor \d, T1 +.endm + +.macro F2 b, c, d + mov \d, T1 + SWAP_REG_NAMES \d, T1 + xor \c, T1 + xor \b, T1 +.endm + +.macro F3 b, c ,d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + mov \b, T2 + or \b, T1 + and \c, T2 + and \d, T1 + or T2, T1 +.endm + +.macro F4 b, c, d + F2 \b, \c, \d +.endm + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +/* + * RR does two rounds of SHA-1 back to back with W[] pre-calc + * t1 = F(b, c, d); e += w(i) + * e += t1; b <<= 30; d += w(i+1); + * t1 = F(a, b, c); + * d += t1; a <<= 5; + * e += a; + * t1 = e; a >>= 7; + * t1 <<= 5; + * d += t1; + */ +.macro RR F, a, b, c, d, e, round + add WK(\round), \e + \F \b, \c, \d # t1 = F(b, c, d); + W_PRECALC (\round + W_PRECALC_AHEAD) + rol $30, \b + add T1, \e + add WK(\round + 1), \d + + \F \a, \b, \c + W_PRECALC (\round + W_PRECALC_AHEAD + 1) + rol $5, \a + add \a, \e + add T1, \d + ror $7, \a # (a <<r 5) >>r 7) => a <<r 30) + + mov \e, T1 + SWAP_REG_NAMES \e, T1 + rol $5, T1 + add T1, \d + + # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c +.endm + +.macro W_PRECALC r + .set i, \r + + .if (i < 20) + .set K_XMM, 0 + .elseif (i < 40) + .set K_XMM, 16 + .elseif (i < 60) + .set K_XMM, 32 + .elseif (i < 80) + .set K_XMM, 48 + .endif + + .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD)))) + .set i, ((\r) % 80) # pre-compute for the next iteration + .if (i == 0) + W_PRECALC_RESET + .endif + W_PRECALC_00_15 + .elseif (i<32) + W_PRECALC_16_31 + .elseif (i < 80) // rounds 32-79 + W_PRECALC_32_79 + .endif +.endm + +.macro W_PRECALC_RESET + .set W, W0 + .set W_minus_04, W4 + .set W_minus_08, W8 + .set W_minus_12, W12 + .set W_minus_16, W16 + .set W_minus_20, W20 + .set W_minus_24, W24 + .set W_minus_28, W28 + .set W_minus_32, W +.endm + +.macro W_PRECALC_ROTATE + .set W_minus_32, W_minus_28 + .set W_minus_28, W_minus_24 + .set W_minus_24, W_minus_20 + .set W_minus_20, W_minus_16 + .set W_minus_16, W_minus_12 + .set W_minus_12, W_minus_08 + .set W_minus_08, W_minus_04 + .set W_minus_04, W + .set W, W_minus_32 +.endm + +.macro W_PRECALC_SSSE3 + +.macro W_PRECALC_00_15 + W_PRECALC_00_15_SSSE3 +.endm +.macro W_PRECALC_16_31 + W_PRECALC_16_31_SSSE3 +.endm +.macro W_PRECALC_32_79 + W_PRECALC_32_79_SSSE3 +.endm + +/* message scheduling pre-compute for rounds 0-15 */ +.macro W_PRECALC_00_15_SSSE3 + .if ((i & 3) == 0) + movdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + pshufb XMM_SHUFB_BSWAP, W_TMP1 + movdqa W_TMP1, W + .elseif ((i & 3) == 2) + paddd (K_BASE), W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 16-31 + * + * - calculating last 32 w[i] values in 8 XMM registers + * - pre-calculate K+w[i] values and store to mem, for later load by ALU add + * instruction + * + * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] + * dependency, but improves for 32-79 + */ +.macro W_PRECALC_16_31_SSSE3 + # blended scheduling of vector and scalar instruction streams, one 4-wide + # vector iteration / 4 scalar rounds + .if ((i & 3) == 0) + movdqa W_minus_12, W + palignr $8, W_minus_16, W # w[i-14] + movdqa W_minus_04, W_TMP1 + psrldq $4, W_TMP1 # w[i-3] + pxor W_minus_08, W + .elseif ((i & 3) == 1) + pxor W_minus_16, W_TMP1 + pxor W_TMP1, W + movdqa W, W_TMP2 + movdqa W, W_TMP1 + pslldq $12, W_TMP2 + .elseif ((i & 3) == 2) + psrld $31, W + pslld $1, W_TMP1 + por W, W_TMP1 + movdqa W_TMP2, W + psrld $30, W_TMP2 + pslld $2, W + .elseif ((i & 3) == 3) + pxor W, W_TMP1 + pxor W_TMP2, W_TMP1 + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 32-79 + * + * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + *
instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken + */ +.macro W_PRECALC_32_79_SSSE3 + .if ((i & 3) == 0) + movdqa W_minus_04, W_TMP1 + pxor W_minus_28, W # W is W_minus_32 before xor + palignr $8, W_minus_08, W_TMP1 + .elseif ((i & 3) == 1) + pxor W_minus_16, W + pxor W_TMP1, W + movdqa W, W_TMP1 + .elseif ((i & 3) == 2) + psrld $30, W + pslld $2, W_TMP1 + por W, W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_SSSE3 + + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.section .rodata +.align 16 + +K_XMM_AR: + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + + +.section .text + +W_PRECALC_SSSE3 +.macro xmm_mov a, b + movdqu \a,\b +.endm + +/* + * SSSE3 optimized implementation: + * + * void sha1_transform_ssse3(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +SHA1_VECTOR_ASM sha1_transform_ssse3 + +.macro W_PRECALC_AVX + +.purgem W_PRECALC_00_15 +.macro W_PRECALC_00_15 + W_PRECALC_00_15_AVX +.endm +.purgem W_PRECALC_16_31 +.macro W_PRECALC_16_31 + W_PRECALC_16_31_AVX +.endm +.purgem W_PRECALC_32_79 +.macro W_PRECALC_32_79 + W_PRECALC_32_79_AVX +.endm + +.macro W_PRECALC_00_15_AVX + .if ((i & 3) == 0) + vmovdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + vpshufb XMM_SHUFB_BSWAP, W_TMP1, W + .elseif ((i & 3) == 2) + vpaddd (K_BASE), W, W_TMP1 + .elseif ((i & 3) == 3) + vmovdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_16_31_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] + vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] + vpxor W_minus_08, W, W + vpxor W_minus_16, W_TMP1, W_TMP1 + .elseif ((i & 3) == 1) + vpxor W_TMP1, W, W + vpslldq $12, W, W_TMP2 + vpslld $1, W, W_TMP1 + .elseif ((i & 3) == 2) + vpsrld $31, W, W + vpor W, W_TMP1, W_TMP1 + vpslld $2, W_TMP2, W + vpsrld $30, W_TMP2, W_TMP2 + .elseif ((i & 3) == 3) + vpxor W, W_TMP1, W_TMP1 + vpxor W_TMP2, W_TMP1, W + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_32_79_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_08, W_minus_04, W_TMP1 + vpxor W_minus_28, W, W # W is W_minus_32 before xor + .elseif ((i & 3) == 1) + vpxor W_minus_16, W_TMP1, W_TMP1 + vpxor W_TMP1, W, W + .elseif ((i & 3) == 2) + vpslld $2, W, W_TMP1 + vpsrld $30, W, W + vpor W, W_TMP1, W + .elseif ((i & 3) == 3) + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_AVX + +W_PRECALC_AVX +.purgem xmm_mov +.macro xmm_mov a, b + vmovdqu \a,\b +.endm + + +/* AVX optimized implementation: + * void sha1_transform_avx(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +SHA1_VECTOR_ASM sha1_transform_avx diff --git a/lib/crypto/x86/sha1.h b/lib/crypto/x86/sha1.h new file mode 100644 index 000000000000..e308379d89bc --- /dev/null +++ b/lib/crypto/x86/sha1.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for x86_64 + * + * Copyright 2025 Google LLC + */ +#include +#include + +DEFINE_STATIC_CALL(sha1_blocks_x86, sha1_blocks_generic); + +#define DEFINE_X86_SHA1_FN(c_fn, asm_fn) \ + asmlinkage void asm_fn(struct 
sha1_block_state *state, \ + const u8 *data, size_t nblocks); \ + static void c_fn(struct sha1_block_state *state, \ + const u8 *data, size_t nblocks) \ + { \ + if (likely(irq_fpu_usable())) { \ + kernel_fpu_begin(); \ + asm_fn(state, data, nblocks); \ + kernel_fpu_end(); \ + } else { \ + sha1_blocks_generic(state, data, nblocks); \ + } \ + } + +DEFINE_X86_SHA1_FN(sha1_blocks_ssse3, sha1_transform_ssse3); +DEFINE_X86_SHA1_FN(sha1_blocks_avx, sha1_transform_avx); +DEFINE_X86_SHA1_FN(sha1_blocks_ni, sha1_ni_transform); + +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ + +asmlinkage void sha1_transform_avx2(struct sha1_block_state *state, + const u8 *data, size_t nblocks); +static void sha1_blocks_avx2(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (likely(irq_fpu_usable())) { + kernel_fpu_begin(); + /* Select the optimal transform based on the number of blocks */ + if (nblocks >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(state, data, nblocks); + else + sha1_transform_avx(state, data, nblocks); + kernel_fpu_end(); + } else { + sha1_blocks_generic(state, data, nblocks); + } +} + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + static_call(sha1_blocks_x86)(state, data, nblocks); +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) { + static_call_update(sha1_blocks_x86, sha1_blocks_ni); + } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI1) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha1_blocks_x86, sha1_blocks_avx2); + else + static_call_update(sha1_blocks_x86, sha1_blocks_avx); + } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { + static_call_update(sha1_blocks_x86, sha1_blocks_ssse3); + } +} -- cgit v1.2.3 From c76ed8790b3018fe36647d9aae96e0373f321184 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:23:05 -0700 Subject: crypto: sha1 - Remove sha1_base.h sha1_base.h is no longer used, so remove it. 
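
The x86 glue header above makes its implementation choice exactly once, in sha1_mod_init_arch(), and then routes every call through a static call so that no CPU-feature test is repeated per block. As a rough illustration only (this is not kernel code: the *_demo names are invented, a plain function pointer stands in for static_call(), and only a couple of the feature checks are mirrored), the same "select once at init, dispatch ever after" shape looks like this in user-space C:

    /* Illustrative sketch; assumes GCC or clang on x86-64 for __builtin_cpu_supports(). */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct sha1_block_state { uint32_t h[5]; };

    /* Stand-ins for the real block functions; bodies omitted for brevity. */
    static void sha1_blocks_generic_demo(struct sha1_block_state *state,
                                         const uint8_t *data, size_t nblocks)
    { (void)state; (void)data; (void)nblocks; }

    static void sha1_blocks_avx2_demo(struct sha1_block_state *state,
                                      const uint8_t *data, size_t nblocks)
    { (void)state; (void)data; (void)nblocks; }

    static void sha1_blocks_ssse3_demo(struct sha1_block_state *state,
                                       const uint8_t *data, size_t nblocks)
    { (void)state; (void)data; (void)nblocks; }

    /* The "static call": defaults to the generic code until init upgrades it. */
    static void (*sha1_blocks_demo)(struct sha1_block_state *, const uint8_t *,
                                    size_t) = sha1_blocks_generic_demo;

    static void sha1_demo_init(void)
    {
    #if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
            __builtin_cpu_init();
            if (__builtin_cpu_supports("avx2"))
                    sha1_blocks_demo = sha1_blocks_avx2_demo;
            else if (__builtin_cpu_supports("ssse3"))
                    sha1_blocks_demo = sha1_blocks_ssse3_demo;
    #endif
    }

    int main(void)
    {
            struct sha1_block_state st = { { 0x67452301, 0xefcdab89, 0x98badcfe,
                                             0x10325476, 0xc3d2e1f0 } };
            uint8_t block[64] = { 0 };

            sha1_demo_init();
            sha1_blocks_demo(&st, block, 1);  /* all callers go through the pointer */
            puts("done");
            return 0;
    }

The kernel's static_call() goes a step further and patches the call sites so that not even an indirect call remains, but the shape of the decision is the same as in sha1_mod_init_arch() above.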
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-15-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha1_base.h | 82 ---------------------------------------------- 1 file changed, 82 deletions(-) delete mode 100644 include/crypto/sha1_base.h diff --git a/include/crypto/sha1_base.h b/include/crypto/sha1_base.h deleted file mode 100644 index 62701d136c79..000000000000 --- a/include/crypto/sha1_base.h +++ /dev/null @@ -1,82 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sha1_base.h - core logic for SHA-1 implementations - * - * Copyright (C) 2015 Linaro Ltd - */ - -#ifndef _CRYPTO_SHA1_BASE_H -#define _CRYPTO_SHA1_BASE_H - -#include -#include -#include -#include -#include -#include - -typedef void (sha1_block_fn)(struct sha1_state *sst, u8 const *src, int blocks); - -static inline int sha1_base_init(struct shash_desc *desc) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA1_H0; - sctx->state[1] = SHA1_H1; - sctx->state[2] = SHA1_H2; - sctx->state[3] = SHA1_H3; - sctx->state[4] = SHA1_H4; - sctx->count = 0; - - return 0; -} - -static inline int sha1_base_do_update_blocks(struct shash_desc *desc, - const u8 *data, - unsigned int len, - sha1_block_fn *block_fn) -{ - unsigned int remain = len - round_down(len, SHA1_BLOCK_SIZE); - struct sha1_state *sctx = shash_desc_ctx(desc); - - sctx->count += len - remain; - block_fn(sctx, data, len / SHA1_BLOCK_SIZE); - return remain; -} - -static inline int sha1_base_do_finup(struct shash_desc *desc, - const u8 *src, unsigned int len, - sha1_block_fn *block_fn) -{ - unsigned int bit_offset = SHA1_BLOCK_SIZE / 8 - 1; - struct sha1_state *sctx = shash_desc_ctx(desc); - union { - __be64 b64[SHA1_BLOCK_SIZE / 4]; - u8 u8[SHA1_BLOCK_SIZE * 2]; - } block = {}; - - if (len >= bit_offset * 8) - bit_offset += SHA1_BLOCK_SIZE / 8; - memcpy(&block, src, len); - block.u8[len] = 0x80; - sctx->count += len; - block.b64[bit_offset] = cpu_to_be64(sctx->count << 3); - block_fn(sctx, block.u8, (bit_offset + 1) * 8 / SHA1_BLOCK_SIZE); - memzero_explicit(&block, sizeof(block)); - - return 0; -} - -static inline int sha1_base_finish(struct shash_desc *desc, u8 *out) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - __be32 *digest = (__be32 *)out; - int i; - - for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) - put_unaligned_be32(sctx->state[i], digest++); - - return 0; -} - -#endif /* _CRYPTO_SHA1_BASE_H */ -- cgit v1.2.3 From f88ed14aa0ef64ff5633605114efe313a0bed84b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Jul 2025 12:18:59 -0700 Subject: lib/crypto: x86/sha1-ni: Minor optimizations and cleanup - Store the previous state in %xmm8-%xmm9 instead of spilling it to the stack. There are plenty of unused XMM registers here, so there is no reason to spill to the stack. (While 32-bit code is limited to %xmm0-%xmm7, this is 64-bit code, so it's free to use %xmm8-%xmm15.) - Remove the unnecessary check for nblocks == 0. sha1_ni_transform() is always passed a positive nblocks. - To get an XMM register with 'e' in the high dword and the rest zeroes, just zeroize the register using pxor, then load 'e'. Previously the code loaded 'e', then zeroized the lower dwords by AND-ing with a constant, which was slightly less efficient. - Instead of computing &DATA_PTR[NBLOCKS << 6] and stopping when DATA_PTR reaches that value, instead just decrement NBLOCKS on each iteration and stop when it reaches 0. This is fewer instructions. - Rename DIGEST_PTR to STATE_PTR. 
It points to the SHA-1 internal state, not a SHA-1 digest value. This commit shrinks the code size of sha1_ni_transform() from 624 bytes to 589 bytes and also shrinks rodata by 16 bytes. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250718191900.42877-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/sha1-ni-asm.S | 68 ++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 43 deletions(-) diff --git a/lib/crypto/x86/sha1-ni-asm.S b/lib/crypto/x86/sha1-ni-asm.S index 3989b0642ff5..1d08b2f364ce 100644 --- a/lib/crypto/x86/sha1-ni-asm.S +++ b/lib/crypto/x86/sha1-ni-asm.S @@ -55,13 +55,10 @@ #include -#define DIGEST_PTR %rdi /* 1st arg */ +#define STATE_PTR %rdi /* 1st arg */ #define DATA_PTR %rsi /* 2nd arg */ #define NUM_BLKS %rdx /* 3rd arg */ -/* gcc conversion */ -#define FRAME_SIZE 32 /* space for 2x16 bytes */ - #define ABCD %xmm0 #define E0 %xmm1 /* Need two E's b/c they ping pong */ #define E1 %xmm2 @@ -70,15 +67,17 @@ #define MSG2 %xmm5 #define MSG3 %xmm6 #define SHUF_MASK %xmm7 - +#define ABCD_SAVED %xmm8 +#define E0_SAVED %xmm9 /* * Intel SHA Extensions optimized implementation of a SHA-1 block function * * This function takes a pointer to the current SHA-1 state, a pointer to the - * input data, and the number of 64-byte blocks to process. Once all blocks - * have been processed, the state is updated with the new state. This function - * only processes complete blocks. State initialization, buffering of partial + * input data, and the number of 64-byte blocks to process. The number of + * blocks to process is assumed to be nonzero. Once all blocks have been + * processed, the state is updated with the new state. This function only + * processes complete blocks. State initialization, buffering of partial * blocks, and digest finalization are expected to be handled elsewhere. * * The indented lines in the loop are instructions related to rounds processing. @@ -89,27 +88,19 @@ */ .text SYM_FUNC_START(sha1_ni_transform) - push %rbp - mov %rsp, %rbp - sub $FRAME_SIZE, %rsp - and $~0xF, %rsp - - shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash - add DATA_PTR, NUM_BLKS /* pointer to end of data */ - - /* load initial hash values */ - pinsrd $3, 1*16(DIGEST_PTR), E0 - movdqu 0*16(DIGEST_PTR), ABCD - pand UPPER_WORD_MASK(%rip), E0 + + /* Load the initial state from STATE_PTR. */ + pxor E0, E0 + pinsrd $3, 16(STATE_PTR), E0 + movdqu (STATE_PTR), ABCD pshufd $0x1B, ABCD, ABCD movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK -.Lloop0: - /* Save hash values for addition after rounds */ - movdqa E0, (0*16)(%rsp) - movdqa ABCD, (1*16)(%rsp) +.Lnext_block: + /* Save the state for addition after the rounds. */ + movdqa E0, E0_SAVED + movdqa ABCD, ABCD_SAVED /* Rounds 0-3 */ movdqu 0*16(DATA_PTR), MSG0 @@ -267,23 +258,19 @@ SYM_FUNC_START(sha1_ni_transform) movdqa ABCD, E0 sha1rnds4 $3, E1, ABCD - /* Add current hash values with previously saved */ - sha1nexte (0*16)(%rsp), E0 - paddd (1*16)(%rsp), ABCD + /* Add the previous state (before the rounds) to the current state. */ + sha1nexte E0_SAVED, E0 + paddd ABCD_SAVED, ABCD - /* Increment data pointer and loop if more to process */ + /* Advance to the next block, or break if there are no more blocks. */ add $64, DATA_PTR - cmp NUM_BLKS, DATA_PTR - jne .Lloop0 + dec NUM_BLKS + jnz .Lnext_block - /* Write hash values back in the correct order */ + /* Store the new state to STATE_PTR. 
*/ + pextrd $3, E0, 16(STATE_PTR) pshufd $0x1B, ABCD, ABCD - movdqu ABCD, 0*16(DIGEST_PTR) - pextrd $3, E0, 1*16(DIGEST_PTR) - -.Ldone_hash: - mov %rbp, %rsp - pop %rbp + movdqu ABCD, (STATE_PTR) RET SYM_FUNC_END(sha1_ni_transform) @@ -292,8 +279,3 @@ SYM_FUNC_END(sha1_ni_transform) .align 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x000102030405060708090a0b0c0d0e0f - -.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16 -.align 16 -UPPER_WORD_MASK: - .octa 0xFFFFFFFF000000000000000000000000 -- cgit v1.2.3 From 42e3376e0954d7e589e5a53d6149835cad64e8e6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Jul 2025 12:19:00 -0700 Subject: lib/crypto: x86/sha1-ni: Convert to use rounds macros The assembly code that does all 80 rounds of SHA-1 is highly repetitive. Replace it with 20 expansions of a macro that does 4 rounds, using the macro arguments and .if directives to handle the slight variations between rounds. This reduces the length of sha1-ni-asm.S by 129 lines while still producing the exact same object file. This mirrors sha256-ni-asm.S which uses this same strategy. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250718191900.42877-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/sha1-ni-asm.S | 187 +++++++------------------------------------ 1 file changed, 29 insertions(+), 158 deletions(-) diff --git a/lib/crypto/x86/sha1-ni-asm.S b/lib/crypto/x86/sha1-ni-asm.S index 1d08b2f364ce..428f9b960594 100644 --- a/lib/crypto/x86/sha1-ni-asm.S +++ b/lib/crypto/x86/sha1-ni-asm.S @@ -70,6 +70,29 @@ #define ABCD_SAVED %xmm8 #define E0_SAVED %xmm9 +.macro do_4rounds i, m0, m1, m2, m3, e0, e1 +.if \i < 16 + movdqu \i*4(DATA_PTR), \m0 + pshufb SHUF_MASK, \m0 +.endif +.if \i == 0 + paddd \m0, \e0 +.else + sha1nexte \m0, \e0 +.endif + movdqa ABCD, \e1 +.if \i >= 12 && \i < 76 + sha1msg2 \m0, \m1 +.endif + sha1rnds4 $\i / 20, \e0, ABCD +.if \i >= 4 && \i < 68 + sha1msg1 \m0, \m3 +.endif +.if \i >= 8 && \i < 72 + pxor \m0, \m2 +.endif +.endm + /* * Intel SHA Extensions optimized implementation of a SHA-1 block function * @@ -80,9 +103,6 @@ * processes complete blocks. State initialization, buffering of partial * blocks, and digest finalization are expected to be handled elsewhere. * - * The indented lines in the loop are instructions related to rounds processing. - * The non-indented lines are instructions related to the message schedule. 
- * * void sha1_ni_transform(struct sha1_block_state *state, * const u8 *data, size_t nblocks) */ @@ -102,161 +122,12 @@ SYM_FUNC_START(sha1_ni_transform) movdqa E0, E0_SAVED movdqa ABCD, ABCD_SAVED - /* Rounds 0-3 */ - movdqu 0*16(DATA_PTR), MSG0 - pshufb SHUF_MASK, MSG0 - paddd MSG0, E0 - movdqa ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - movdqu 1*16(DATA_PTR), MSG1 - pshufb SHUF_MASK, MSG1 - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - movdqu 2*16(DATA_PTR), MSG2 - pshufb SHUF_MASK, MSG2 - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 12-15 */ - movdqu 3*16(DATA_PTR), MSG3 - pshufb SHUF_MASK, MSG3 - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 28-31 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 44-47 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - pxor MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1rnds4 $3, E1, ABCD +.irp i, 0, 16, 32, 48, 64 + do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3, E0, E1 + do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0, E1, E0 + do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1, E0, E1 + do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2, E1, E0 +.endr /* Add the previous state (before the rounds) to the current state. 
*/ sha1nexte E0_SAVED, E0 -- cgit v1.2.3 From debc1e5a431779c027a5752f247a4de2e4f702b2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Jul 2025 15:07:06 -0700 Subject: lib/crypto: arm64/sha512-ce: Drop compatibility macros for older binutils Now that the oldest supported binutils version is 2.30, the macros that emit the SHA-512 instructions as '.inst' words are no longer needed. So drop them. No change in the generated machine code. Changed from the original patch by Ard Biesheuvel: (https://lore.kernel.org/r/20250515142702.2592942-2-ardb+git@google.com): - Reduced scope to just SHA-512 - Added comment that explains why "sha3" is used instead of "sha2" Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250718220706.475240-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/arm64/sha512-ce-core.S | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/lib/crypto/arm64/sha512-ce-core.S b/lib/crypto/arm64/sha512-ce-core.S index 7d870a435ea3..22f1ded89bc8 100644 --- a/lib/crypto/arm64/sha512-ce-core.S +++ b/lib/crypto/arm64/sha512-ce-core.S @@ -12,26 +12,17 @@ #include #include - .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 - .set .Lq\b, \b - .set .Lv\b\().2d, \b - .endr - - .macro sha512h, rd, rn, rm - .inst 0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16) - .endm - - .macro sha512h2, rd, rn, rm - .inst 0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16) - .endm - - .macro sha512su0, rd, rn - .inst 0xcec08000 | .L\rd | (.L\rn << 5) - .endm - - .macro sha512su1, rd, rn, rm - .inst 0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16) - .endm + /* + * We have to specify the "sha3" feature here, since the GNU and clang + * assemblers both consider the SHA-512 instructions to be part of the + * "sha3" feature. (Except binutils 2.30 through 2.42, which used + * "sha2". But "sha3" implies "sha2", so "sha3" still works in those + * versions.) "sha3" doesn't make a lot of sense, since SHA-512 is part + * of the SHA-2 family of algorithms, and also the Arm Architecture + * Reference Manual defines FEAT_SHA512 and FEAT_SHA3 separately. + * Regardless, we must use "sha3" to be compatible with the assemblers. + */ + .arch armv8-a+sha3 /* * The SHA-512 round constants -- cgit v1.2.3
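
Stepping back to the SSSE3/AVX implementation added earlier in this series: its W_PRECALC_32_79 comment swaps the specification's message-schedule recurrence, w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1, for the equivalent w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 once i >= 32. The rewritten form has no w[i-3] term, so four consecutive w[i] values can be produced by one vector operation. The identity follows from substituting the original recurrence into itself and cancelling the duplicated terms; the short standalone C program below (purely illustrative, not part of these patches) checks it numerically:

    /* Verify: for i >= 32, the rewritten SHA-1 schedule recurrence matches the
     * standard one.  Illustrative user-space code, not kernel code. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static uint32_t rol32(uint32_t x, int n)
    {
            return (x << n) | (x >> (32 - n));
    }

    int main(void)
    {
            uint32_t w[80];
            int i, trial;

            srand(1);
            for (trial = 0; trial < 1000; trial++) {
                    /* random 16-word message block */
                    for (i = 0; i < 16; i++)
                            w[i] = ((uint32_t)rand() << 16) ^ (uint32_t)rand();

                    /* standard schedule: w[i] = (w[i-3]^w[i-8]^w[i-14]^w[i-16]) rol 1 */
                    for (i = 16; i < 80; i++)
                            w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);

                    /* rewritten form must agree from round 32 onwards */
                    for (i = 32; i < 80; i++)
                            assert(w[i] ==
                                   rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2));
            }
            puts("schedule identity holds");
            return 0;
    }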